diff --git a/src/USER-OMP/Install.sh b/src/USER-OMP/Install.sh
new file mode 100644
index 0000000000..a290fb47bd
--- /dev/null
+++ b/src/USER-OMP/Install.sh
@@ -0,0 +1,51 @@
+# Install/unInstall package files in LAMMPS
+# do not install child files if parent does not exist
+
+for file in *_omp.cpp *_omp.h  pppm*proxy.h pppm*proxy.cpp; do
+    # let us see if the "rain man" can count the toothpicks...
+   ofile=`echo $file | sed  -e s,_pppm_tip4p_omp,_long_tip4p_omp, \
+   -e s,pppm.\\*_proxy,pppm_omp, -e s,_pppm_omp,_long_omp, \
+   -e s,\\\\\\(.\\*\\\\\\)_omp\\\\.h,\\\\1.h, \
+   -e s,\\\\\\(.\\*\\\\\\)_omp\\\\.cpp,\\\\1.cpp,`
+  if (test $1 = 1) then
+    if (test $file = "thr_omp.h") || (test $file = "thr_omp.cpp") then
+      :  # always install those files.
+    elif (test ! -e ../$ofile) then
+      continue
+    fi
+
+    cp $file ..
+
+  elif (test $1 = 0) then
+    rm -f ../$file
+  fi
+done
+
+if (test $1 = 1) then
+
+  if (test -e ../Makefile.package) then
+    sed -i -e 's/[^ \t]*OMP[^ \t]* //' ../Makefile.package
+    sed -i -e 's|^PKG_INC =[ \t]*|&-DLMP_USER_OMP |' ../Makefile.package
+  fi
+
+  # force rebuild of files with LMP_USER_OMP switch
+
+  touch ../accelerator_omp.h
+
+  cp thr_data.h ..
+  cp thr_data.cpp ..
+
+elif (test $1 = 0) then
+
+  if (test -e ../Makefile.package) then
+    sed -i -e 's/[^ \t]*OMP[^ \t]* //' ../Makefile.package
+  fi
+
+  # force rebuild of files with LMP_USER_OMP switch
+
+  touch ../accelerator_omp.h
+
+  rm -f ../thr_data.h
+  rm -f ../thr_data.cpp
+
+fi
diff --git a/src/USER-OMP/Package.sh b/src/USER-OMP/Package.sh
new file mode 100644
index 0000000000..6f577b2791
--- /dev/null
+++ b/src/USER-OMP/Package.sh
@@ -0,0 +1,46 @@
+# Update package files in LAMMPS
+# copy package file to src if it doesn't exists or is different
+# do not copy OpenMP style files, if a non-OpenMP version does 
+# not exist. Do remove OpenMP style files that have no matching
+# non-OpenMP version installed, e.g. after a package has been
+# removed
+for file in *_omp.cpp *_omp.h pppm*proxy.h pppm*proxy.cpp thr_data.h thr_data.cpp; do
+  # let us see if the "rain man" can count the toothpicks...
+   ofile=`echo $file | sed  -e s,_pppm_tip4p_omp,_long_tip4p_omp, \
+   -e s,pppm.\\*_proxy,pppm_omp, -e s,_pppm_omp,_long_omp, \
+   -e s,\\\\\\(.\\*\\\\\\)_omp\\\\.\\\\\\(h\\\\\\|cpp\\\\\\),\\\\1.\\\\2,`
+  if (test $file = "thr_omp.h") || (test $file = "thr_omp.cpp") \
+      || (test $file = "thr_data.h") || (test $file = "thr_data.cpp") then
+    if (test ! -e ../$file) then
+      echo "  creating src/$file"
+      cp $file ..
+    elif ! cmp -s $file ../$file ; then
+      echo "  updating src/$file"
+      cp $file ..
+    fi
+  elif (test ! -e ../$ofile) then
+    if (test -e ../$file) then
+      echo "  removing src/$file"
+      rm -f ../$file
+    fi
+  else
+    if (test ! -e ../$file) then
+      echo "  creating src/$file"
+      cp $file ..
+    elif ! cmp -s $file ../$file ; then
+      echo "  updating src/$file"
+      cp $file ..
+    fi
+  fi
+done
+
+for file in thr_data.h thr_data.cpp; do
+  if (test ! -e ../$file) then
+    echo "  creating src/$file"
+    cp $file ..
+  elif ! cmp -s $file ../$file ; then
+    echo "  updating src/$file"
+    cp $file ..
+  fi
+done
+
diff --git a/src/USER-OMP/README b/src/USER-OMP/README
new file mode 100644
index 0000000000..46f63f646b
--- /dev/null
+++ b/src/USER-OMP/README
@@ -0,0 +1,19 @@
+This package provides OpenMP multi-threading support and other
+optimizations of various LAMMPS pair styles, dihedral styles, and fix
+styles.
+ 
+See this section of the manual to get started:
+
+doc/Section_accelerate.html, sub-section 5.2
+
+The person who created this package is Axel Kohlmeyer at Temple U
+(akohlmey at gmail.com).  Contact him directly if you have questions.
+
+--------------------------
+
+This directory also contains a shell script:
+
+hack_openmp_for_pgi.sh
+
+which will convert OpenMP directives in src files
+into a form compatible with the PGI compiler.
diff --git a/src/USER-OMP/angle_charmm_omp.cpp b/src/USER-OMP/angle_charmm_omp.cpp
new file mode 100644
index 0000000000..4cc86dd11c
--- /dev/null
+++ b/src/USER-OMP/angle_charmm_omp.cpp
@@ -0,0 +1,196 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_charmm_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+ AngleCharmmOMP::AngleCharmmOMP(class LAMMPS *lmp)
+  : AngleCharmm(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleCharmmOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleCharmmOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double delx1,dely1,delz1,delx2,dely2,delz2;
+  double eangle,f1[3],f3[3];
+  double dtheta,tk;
+  double rsq1,rsq2,r1,r2,c,s,a,a11,a12,a22;
+  double delxUB,delyUB,delzUB,rsqUB,rUB,dr,rk,forceUB;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // Urey-Bradley bond
+
+    delxUB = x[i3][0] - x[i1][0];
+    delyUB = x[i3][1] - x[i1][1];
+    delzUB = x[i3][2] - x[i1][2];
+
+    rsqUB = delxUB*delxUB + delyUB*delyUB + delzUB*delzUB;
+    rUB = sqrt(rsqUB);
+
+    // Urey-Bradley force & energy
+
+    dr = rUB - r_ub[type];
+    rk = k_ub[type] * dr;
+
+    if (rUB > 0.0) forceUB = -2.0*rk/rUB;
+    else forceUB = 0.0;
+
+    if (EFLAG) eangle = rk*dr;
+
+    // angle (cos and sin)
+
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    s = sqrt(1.0 - c*c);
+    if (s < SMALL) s = SMALL;
+    s = 1.0/s;
+
+    // harmonic force & energy
+
+    dtheta = acos(c) - theta0[type];
+    tk = k[type] * dtheta;
+
+    if (EFLAG) eangle += tk*dtheta;
+
+    a = -2.0 * tk * s;
+    a11 = a*c / rsq1;
+    a12 = -a / (r1*r2);
+    a22 = a*c / rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2 - delxUB*forceUB;
+    f1[1] = a11*dely1 + a12*dely2 - delyUB*forceUB;
+    f1[2] = a11*delz1 + a12*delz2 - delzUB*forceUB;
+
+    f3[0] = a22*delx2 + a12*delx1 + delxUB*forceUB;
+    f3[1] = a22*dely2 + a12*dely1 + delyUB*forceUB;
+    f3[2] = a22*delz2 + a12*delz1 + delzUB*forceUB;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
diff --git a/src/USER-OMP/angle_charmm_omp.h b/src/USER-OMP/angle_charmm_omp.h
new file mode 100644
index 0000000000..8cbb992c3b
--- /dev/null
+++ b/src/USER-OMP/angle_charmm_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(charmm/omp,AngleCharmmOMP)
+
+#else
+
+#ifndef LMP_ANGLE_CHARMM_OMP_H
+#define LMP_ANGLE_CHARMM_OMP_H
+
+#include "angle_charmm.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class AngleCharmmOMP : public AngleCharmm, public ThrOMP {
+
+ public:
+  AngleCharmmOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/angle_class2_omp.cpp b/src/USER-OMP/angle_class2_omp.cpp
new file mode 100644
index 0000000000..ef887a93e2
--- /dev/null
+++ b/src/USER-OMP/angle_class2_omp.cpp
@@ -0,0 +1,240 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_class2_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleClass2OMP::AngleClass2OMP(class LAMMPS *lmp)
+  : AngleClass2(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleClass2OMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleClass2OMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double delx1,dely1,delz1,delx2,dely2,delz2;
+  double eangle,f1[3],f3[3];
+  double dtheta,dtheta2,dtheta3,dtheta4,de_angle;
+  double dr1,dr2,tk1,tk2,aa1,aa2,aa11,aa12,aa21,aa22;
+  double rsq1,rsq2,r1,r2,c,s,a,a11,a12,a22,b1,b2;
+  double vx11,vx12,vy11,vy12,vz11,vz12,vx21,vx22,vy21,vy22,vz21,vz22;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // angle (cos and sin)
+
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    s = sqrt(1.0 - c*c);
+    if (s < SMALL) s = SMALL;
+    s = 1.0/s;
+
+    // force & energy for angle term
+
+    dtheta = acos(c) - theta0[type];
+    dtheta2 = dtheta*dtheta;
+    dtheta3 = dtheta2*dtheta;
+    dtheta4 = dtheta3*dtheta;
+
+    de_angle = 2.0*k2[type]*dtheta + 3.0*k3[type]*dtheta2 +
+      4.0*k4[type]*dtheta3;
+
+    a = -de_angle*s;
+    a11 = a*c / rsq1;
+    a12 = -a / (r1*r2);
+    a22 = a*c / rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2;
+    f1[1] = a11*dely1 + a12*dely2;
+    f1[2] = a11*delz1 + a12*delz2;
+
+    f3[0] = a22*delx2 + a12*delx1;
+    f3[1] = a22*dely2 + a12*dely1;
+    f3[2] = a22*delz2 + a12*delz1;
+
+    if (EFLAG) eangle = k2[type]*dtheta2 + k3[type]*dtheta3 + k4[type]*dtheta4;
+
+    // force & energy for bond-bond term
+
+    dr1 = r1 - bb_r1[type];
+    dr2 = r2 - bb_r2[type];
+    tk1 = bb_k[type] * dr1;
+    tk2 = bb_k[type] * dr2;
+
+    f1[0] -= delx1*tk2/r1;
+    f1[1] -= dely1*tk2/r1;
+    f1[2] -= delz1*tk2/r1;
+
+    f3[0] -= delx2*tk1/r2;
+    f3[1] -= dely2*tk1/r2;
+    f3[2] -= delz2*tk1/r2;
+
+    if (EFLAG) eangle += bb_k[type]*dr1*dr2;
+
+    // force & energy for bond-angle term
+
+    aa1 = s * dr1 * ba_k1[type];
+    aa2 = s * dr2 * ba_k2[type];
+
+    aa11 = aa1 * c / rsq1;
+    aa12 = -aa1 / (r1 * r2);
+    aa21 = aa2 * c / rsq1;
+    aa22 = -aa2 / (r1 * r2);
+
+    vx11 = (aa11 * delx1) + (aa12 * delx2);
+    vx12 = (aa21 * delx1) + (aa22 * delx2);
+    vy11 = (aa11 * dely1) + (aa12 * dely2);
+    vy12 = (aa21 * dely1) + (aa22 * dely2);
+    vz11 = (aa11 * delz1) + (aa12 * delz2);
+    vz12 = (aa21 * delz1) + (aa22 * delz2);
+
+    aa11 = aa1 * c / rsq2;
+    aa21 = aa2 * c / rsq2;
+
+    vx21 = (aa11 * delx2) + (aa12 * delx1);
+    vx22 = (aa21 * delx2) + (aa22 * delx1);
+    vy21 = (aa11 * dely2) + (aa12 * dely1);
+    vy22 = (aa21 * dely2) + (aa22 * dely1);
+    vz21 = (aa11 * delz2) + (aa12 * delz1);
+    vz22 = (aa21 * delz2) + (aa22 * delz1);
+
+    b1 = ba_k1[type] * dtheta / r1;
+    b2 = ba_k2[type] * dtheta / r2;
+
+    f1[0] -= vx11 + b1*delx1 + vx12;
+    f1[1] -= vy11 + b1*dely1 + vy12;
+    f1[2] -= vz11 + b1*delz1 + vz12;
+
+    f3[0] -= vx21 + b2*delx2 + vx22;
+    f3[1] -= vy21 + b2*dely2 + vy22;
+    f3[2] -= vz21 + b2*delz2 + vz22;
+
+    if (EFLAG) eangle += ba_k1[type]*dr1*dtheta + ba_k2[type]*dr2*dtheta;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
diff --git a/src/USER-OMP/angle_class2_omp.h b/src/USER-OMP/angle_class2_omp.h
new file mode 100644
index 0000000000..fcfdae51a7
--- /dev/null
+++ b/src/USER-OMP/angle_class2_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(class2/omp,AngleClass2OMP)
+
+#else
+
+#ifndef LMP_ANGLE_CLASS2_OMP_H
+#define LMP_ANGLE_CLASS2_OMP_H
+
+#include "angle_class2.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class AngleClass2OMP : public AngleClass2, public ThrOMP {
+
+ public:
+  AngleClass2OMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/angle_cosine_delta_omp.cpp b/src/USER-OMP/angle_cosine_delta_omp.cpp
new file mode 100644
index 0000000000..caf259b15c
--- /dev/null
+++ b/src/USER-OMP/angle_cosine_delta_omp.cpp
@@ -0,0 +1,189 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_cosine_delta_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleCosineDeltaOMP::AngleCosineDeltaOMP(class LAMMPS *lmp)
+ : AngleCosineDelta(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleCosineDeltaOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleCosineDeltaOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double delx1,dely1,delz1,delx2,dely2,delz2,theta,dtheta,dcostheta,tk;
+  double eangle,f1[3],f3[3];
+  double rsq1,rsq2,r1,r2,c,a,cot,a11,a12,a22,b11,b12,b22,c0,s0,s;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // angle (cos and sin)
+
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    theta = acos(c);
+
+    s = sqrt(1.0 - c*c);
+    if (s < SMALL) s = SMALL;
+    s = 1.0/s;
+
+    cot = c/s;
+
+    // force & energy
+
+    dtheta = theta - theta0[type];
+    dcostheta = cos(dtheta);
+    tk = k[type] * (1.0-dcostheta);
+
+    if (EFLAG) eangle = tk;
+
+    a = -k[type];
+
+    // expand dtheta for cos and sin contribution to force
+
+    a11 = a*c / rsq1;
+    a12 = -a / (r1*r2);
+    a22 = a*c / rsq2;
+
+    b11 = -a*c*cot / rsq1;
+    b12 = a*cot / (r1*r2);
+    b22 = -a*c*cot / rsq2;
+
+    c0 = cos(theta0[type]);
+    s0 = sin(theta0[type]);
+
+    f1[0] = (a11*delx1 + a12*delx2)*c0 + (b11*delx1 + b12*delx2)*s0;
+    f1[1] = (a11*dely1 + a12*dely2)*c0 + (b11*dely1 + b12*dely2)*s0;
+    f1[2] = (a11*delz1 + a12*delz2)*c0 + (b11*delz1 + b12*delz2)*s0;
+    f3[0] = (a22*delx2 + a12*delx1)*c0 + (b22*delx2 + b12*delx1)*s0;
+    f3[1] = (a22*dely2 + a12*dely1)*c0 + (b22*dely2 + b12*dely1)*s0;
+    f3[2] = (a22*delz2 + a12*delz1)*c0 + (b22*delz2 + b12*delz1)*s0;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
diff --git a/src/USER-OMP/angle_cosine_delta_omp.h b/src/USER-OMP/angle_cosine_delta_omp.h
new file mode 100644
index 0000000000..57b1d9364d
--- /dev/null
+++ b/src/USER-OMP/angle_cosine_delta_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(cosine/delta/omp,AngleCosineDeltaOMP)
+
+#else
+
+#ifndef LMP_ANGLE_COSINE_DELTA_OMP_H
+#define LMP_ANGLE_COSINE_DELTA_OMP_H
+
+#include "angle_cosine_delta.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class AngleCosineDeltaOMP : public AngleCosineDelta, public ThrOMP {
+
+ public:
+  AngleCosineDeltaOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/angle_cosine_omp.cpp b/src/USER-OMP/angle_cosine_omp.cpp
new file mode 100644
index 0000000000..51d3494a4f
--- /dev/null
+++ b/src/USER-OMP/angle_cosine_omp.cpp
@@ -0,0 +1,166 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_cosine_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleCosineOMP::AngleCosineOMP(class LAMMPS *lmp)
+  : AngleCosine(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleCosineOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleCosineOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double delx1,dely1,delz1,delx2,dely2,delz2;
+  double eangle,f1[3],f3[3];
+  double rsq1,rsq2,r1,r2,c,a,a11,a12,a22;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // c = cosine of angle
+
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // force & energy
+
+    if (EFLAG) eangle = k[type]*(1.0+c);
+
+    a = k[type];
+    a11 = a*c / rsq1;
+    a12 = -a / (r1*r2);
+    a22 = a*c / rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2;
+    f1[1] = a11*dely1 + a12*dely2;
+    f1[2] = a11*delz1 + a12*delz2;
+    f3[0] = a22*delx2 + a12*delx1;
+    f3[1] = a22*dely2 + a12*dely1;
+    f3[2] = a22*delz2 + a12*delz1;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
diff --git a/src/USER-OMP/angle_cosine_omp.h b/src/USER-OMP/angle_cosine_omp.h
new file mode 100644
index 0000000000..16eb3ff944
--- /dev/null
+++ b/src/USER-OMP/angle_cosine_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(cosine/omp,AngleCosineOMP)
+
+#else
+
+#ifndef LMP_ANGLE_COSINE_OMP_H
+#define LMP_ANGLE_COSINE_OMP_H
+
+#include "angle_cosine.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class AngleCosineOMP : public AngleCosine, public ThrOMP {
+
+ public:
+  AngleCosineOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/angle_cosine_periodic_omp.cpp b/src/USER-OMP/angle_cosine_periodic_omp.cpp
new file mode 100644
index 0000000000..6c5932e70c
--- /dev/null
+++ b/src/USER-OMP/angle_cosine_periodic_omp.cpp
@@ -0,0 +1,204 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_cosine_periodic_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleCosinePeriodicOMP::AngleCosinePeriodicOMP(class LAMMPS *lmp)
+  : AngleCosinePeriodic(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleCosinePeriodicOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleCosinePeriodicOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i,i1,i2,i3,n,m,type,b_factor;
+  double delx1,dely1,delz1,delx2,dely2,delz2;
+  double eangle,f1[3],f3[3];
+  double rsq1,rsq2,r1,r2,c,s,a,a11,a12,a22;
+  double tn,tn_1,tn_2,un,un_1,un_2;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // c = cosine of angle
+
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    m = multiplicity[type];
+    b_factor = b[type];
+
+    // cos(n*x) = Tn(cos(x))
+    // Tn(x) = Chebyshev polynomials of the first kind: T_0 = 1, T_1 = x, ...
+    // recurrence relationship:
+    // Tn(x) = 2*x*T[n-1](x) - T[n-2](x) where T[-1](x) = 0
+    // also, dTn(x)/dx = n*U[n-1](x)
+    // where Un(x) = 2*x*U[n-1](x) - U[n-2](x) and U[-1](x) = 0
+    // finally need to handle special case for n = 1
+
+    tn = 1.0;
+    tn_1 = 1.0;
+    tn_2 = 0.0;
+    un = 1.0;
+    un_1 = 2.0;
+    un_2 = 0.0;
+
+    s = sqrt(1.0 - c*c);
+    if (s < SMALL) s = SMALL;
+    s = 1.0/s;
+
+    // force & energy
+
+    tn_2 = c;
+    for (i = 1; i <= m; i++) {
+      tn = 2*c*tn_1 - tn_2;
+      tn_2 = tn_1;
+      tn_1 = tn;
+    }
+
+    for (i = 2; i <= m; i++) {
+      un = 2*c*un_1 - un_2;
+      un_2 = un_1;
+      un_1 = un;
+    }
+    tn = b_factor*pow(-1.0,(double)m)*tn;
+    un = b_factor*pow(-1.0,(double)m)*m*un;
+
+    if (EFLAG) eangle = 2*k[type]*(1.0 - tn);
+
+    a = -k[type]*un;
+    a11 = a*c / rsq1;
+    a12 = -a / (r1*r2);
+    a22 = a*c / rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2;
+    f1[1] = a11*dely1 + a12*dely2;
+    f1[2] = a11*delz1 + a12*delz2;
+    f3[0] = a22*delx2 + a12*delx1;
+    f3[1] = a22*dely2 + a12*dely1;
+    f3[2] = a22*delz2 + a12*delz1;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
diff --git a/src/USER-OMP/angle_cosine_periodic_omp.h b/src/USER-OMP/angle_cosine_periodic_omp.h
new file mode 100644
index 0000000000..3c7da10ceb
--- /dev/null
+++ b/src/USER-OMP/angle_cosine_periodic_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(cosine/periodic/omp,AngleCosinePeriodicOMP)
+
+#else
+
+#ifndef LMP_ANGLE_COSINE_PERIODIC_OMP_H
+#define LMP_ANGLE_COSINE_PERIODIC_OMP_H
+
+#include "angle_cosine_periodic.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class AngleCosinePeriodicOMP : public AngleCosinePeriodic, public ThrOMP {
+
+ public:
+  AngleCosinePeriodicOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/angle_cosine_shift_exp_omp.cpp b/src/USER-OMP/angle_cosine_shift_exp_omp.cpp
new file mode 100644
index 0000000000..18af91cf7d
--- /dev/null
+++ b/src/USER-OMP/angle_cosine_shift_exp_omp.cpp
@@ -0,0 +1,187 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_cosine_shift_exp_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleCosineShiftExpOMP::AngleCosineShiftExpOMP(class LAMMPS *lmp)
+  : AngleCosineShiftExp(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleCosineShiftExpOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleCosineShiftExpOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double delx1,dely1,delz1,delx2,dely2,delz2;
+  double eangle,f1[3],f3[3],ff;
+  double rsq1,rsq2,r1,r2,c,s,a11,a12,a22;
+  double exp2,aa,uumin,cccpsss,cssmscc;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // c = cosine of angle
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // C= sine of angle
+    s = sqrt(1.0 - c*c);
+    if (s < SMALL) s = SMALL;
+
+    // force & energy
+
+    aa=a[type];
+    uumin=umin[type];
+
+    cccpsss = c*cost[type]+s*sint[type];
+    cssmscc = c*sint[type]-s*cost[type];
+
+    if (doExpansion[type])
+       {  //  |a|<0.01 so use expansions relative precision <1e-5
+//         std::cout << "Using expansion\n";
+            if (EFLAG) eangle = -0.125*(1+cccpsss)*(4+aa*(cccpsss-1))*uumin;
+            ff=0.25*uumin*cssmscc*(2+aa*cccpsss)/s;
+       }
+     else
+       {
+//   std::cout << "Not using expansion\n";
+            exp2=exp(0.5*aa*(1+cccpsss));
+            if (EFLAG) eangle = opt1[type]*(1-exp2);
+            ff=0.5*a[type]*opt1[type]*exp2*cssmscc/s;
+       }
+
+    a11 =   ff*c/ rsq1;
+    a12 =  -ff  / (r1*r2);
+    a22 =   ff*c/ rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2;
+    f1[1] = a11*dely1 + a12*dely2;
+    f1[2] = a11*delz1 + a12*delz2;
+    f3[0] = a22*delx2 + a12*delx1;
+    f3[1] = a22*dely2 + a12*dely1;
+    f3[2] = a22*delz2 + a12*delz1;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
diff --git a/src/USER-OMP/angle_cosine_shift_exp_omp.h b/src/USER-OMP/angle_cosine_shift_exp_omp.h
new file mode 100644
index 0000000000..aba70f8a3c
--- /dev/null
+++ b/src/USER-OMP/angle_cosine_shift_exp_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(cosine/shift/exp/omp,AngleCosineShiftExpOMP)
+
+#else
+
+#ifndef LMP_ANGLE_COSINE_SHIFT_EXP_OMP_H
+#define LMP_ANGLE_COSINE_SHIFT_EXP_OMP_H
+
+#include "angle_cosine_shift_exp.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class AngleCosineShiftExpOMP : public AngleCosineShiftExp, public ThrOMP {
+
+ public:
+  AngleCosineShiftExpOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/angle_cosine_shift_omp.cpp b/src/USER-OMP/angle_cosine_shift_omp.cpp
new file mode 100644
index 0000000000..a7f40fee8a
--- /dev/null
+++ b/src/USER-OMP/angle_cosine_shift_omp.cpp
@@ -0,0 +1,172 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_cosine_shift_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleCosineShiftOMP::AngleCosineShiftOMP(class LAMMPS *lmp)
+  : AngleCosineShift(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleCosineShiftOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleCosineShiftOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double delx1,dely1,delz1,delx2,dely2,delz2;
+  double f1[3],f3[3];
+  double rsq1,rsq2,r1,r2,c,s,cps,a11,a12,a22;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+  double eangle = 0.0;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // c = cosine of angle
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // C= sine of angle
+    s = sqrt(1.0 - c*c);
+    if (s < SMALL) s = SMALL;
+
+    // force & energy
+    const double kcos=kcost[type];
+    const double ksin=ksint[type];
+    if (EFLAG) eangle = -k[type]-kcos*c-ksin*s;
+
+    cps = c/s;          // NOTE absorbed one c
+
+    a11 = (-kcos +ksin*cps )*c/ rsq1;
+    a12 = ( kcos -ksin*cps )  / (r1*r2);
+    a22 = (-kcos +ksin*cps )*c/ rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2;
+    f1[1] = a11*dely1 + a12*dely2;
+    f1[2] = a11*delz1 + a12*delz2;
+    f3[0] = a22*delx2 + a12*delx1;
+    f3[1] = a22*dely2 + a12*dely1;
+    f3[2] = a22*delz2 + a12*delz1;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
diff --git a/src/USER-OMP/angle_cosine_shift_omp.h b/src/USER-OMP/angle_cosine_shift_omp.h
new file mode 100644
index 0000000000..09402fb341
--- /dev/null
+++ b/src/USER-OMP/angle_cosine_shift_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(cosine/shift/omp,AngleCosineShiftOMP)
+
+#else
+
+#ifndef LMP_ANGLE_COSINE_SHIFT_OMP_H
+#define LMP_ANGLE_COSINE_SHIFT_OMP_H
+
+#include "angle_cosine_shift.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class AngleCosineShiftOMP : public AngleCosineShift, public ThrOMP {
+
+ public:
+  AngleCosineShiftOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/angle_cosine_squared_omp.cpp b/src/USER-OMP/angle_cosine_squared_omp.cpp
new file mode 100644
index 0000000000..e8eaff6b2a
--- /dev/null
+++ b/src/USER-OMP/angle_cosine_squared_omp.cpp
@@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_cosine_squared_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleCosineSquaredOMP::AngleCosineSquaredOMP(class LAMMPS *lmp)
+  : AngleCosineSquared(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleCosineSquaredOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleCosineSquaredOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double delx1,dely1,delz1,delx2,dely2,delz2;
+  double eangle,f1[3],f3[3];
+  double dcostheta,tk;
+  double rsq1,rsq2,r1,r2,c,a,a11,a12,a22;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // angle (cos and sin)
+
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // force & energy
+
+    dcostheta = c - cos(theta0[type]);
+    tk = k[type] * dcostheta;
+
+    if (EFLAG) eangle = tk*dcostheta;
+
+    a = 2.0 * tk;
+    a11 = a*c / rsq1;
+    a12 = -a / (r1*r2);
+    a22 = a*c / rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2;
+    f1[1] = a11*dely1 + a12*dely2;
+    f1[2] = a11*delz1 + a12*delz2;
+    f3[0] = a22*delx2 + a12*delx1;
+    f3[1] = a22*dely2 + a12*dely1;
+    f3[2] = a22*delz2 + a12*delz1;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
diff --git a/src/USER-OMP/angle_cosine_squared_omp.h b/src/USER-OMP/angle_cosine_squared_omp.h
new file mode 100644
index 0000000000..758863e612
--- /dev/null
+++ b/src/USER-OMP/angle_cosine_squared_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(cosine/squared/omp,AngleCosineSquaredOMP)
+
+#else
+
+#ifndef LMP_ANGLE_COSINE_SQUARED_OMP_H
+#define LMP_ANGLE_COSINE_SQUARED_OMP_H
+
+#include "angle_cosine_squared.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class AngleCosineSquaredOMP : public AngleCosineSquared, public ThrOMP {
+
+ public:
+  AngleCosineSquaredOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/angle_dipole_omp.cpp b/src/USER-OMP/angle_dipole_omp.cpp
new file mode 100644
index 0000000000..025ffcd16c
--- /dev/null
+++ b/src/USER-OMP/angle_dipole_omp.cpp
@@ -0,0 +1,125 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_dipole_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleDipoleOMP::AngleDipoleOMP(class LAMMPS *lmp)
+  : AngleDipole(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleDipoleOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  if (!force->newton_bond)
+    error->all(FLERR,"'newton' flag for bonded interactions must be 'on'");
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (eflag)
+      eval<1>(ifrom, ito, thr);
+    else
+      eval<0>(ifrom, ito, thr);
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+
+}
+
+template <int EFLAG>
+void AngleDipoleOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int iRef,iDip,iDummy,n,type;
+  double delx,dely,delz;
+  double eangle,tangle;
+  double r,cosGamma,deltaGamma,kdg,rmu;
+
+  const double * const * const x = atom->x;   // position vector
+  const double * const * const mu = atom->mu; // point-dipole components and moment magnitude
+  double * const * const torque = thr->get_torque();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+  const double f1[3] = {0.0, 0.0, 0.0};
+  const double f3[3] = {0.0, 0.0, 0.0};
+
+  for (n = nfrom; n < nto; n++) {
+    iDip = anglelist[n][0]; // dipole whose orientation is to be restrained
+    iRef = anglelist[n][1]; // reference atom toward which dipole will point
+    iDummy = anglelist[n][2]; // dummy atom - irrelevant to the interaction
+    type = anglelist[n][3];
+
+    delx = x[iRef][0] - x[iDip][0];
+    dely = x[iRef][1] - x[iDip][1];
+    delz = x[iRef][2] - x[iDip][2];
+
+    r = sqrt(delx*delx + dely*dely + delz*delz);
+
+    rmu = r * mu[iDip][3];
+    cosGamma = (mu[iDip][0]*delx+mu[iDip][1]*dely+mu[iDip][2]*delz) / rmu;
+    deltaGamma = cosGamma - cos(gamma0[type]);
+    kdg = k[type] * deltaGamma;
+
+    if (EFLAG) eangle = kdg * deltaGamma; // energy
+
+    tangle = 2.0 * kdg / rmu;
+
+    torque[iDip][0] += tangle * (dely*mu[iDip][2] - delz*mu[iDip][1]);
+    torque[iDip][1] += tangle * (delz*mu[iDip][0] - delx*mu[iDip][2]);
+    torque[iDip][2] += tangle * (delx*mu[iDip][1] - dely*mu[iDip][0]);
+
+    if (EFLAG) // tally energy (virial=0 because force=0)
+      ev_tally_thr(this,iRef,iDip,iDummy,nlocal,/* NEWTON_BOND */ 1,
+                   eangle,f1,f3,0.0,0.0,0.0,0.0,0.0,0.0,thr);
+  }
+}
diff --git a/src/USER-OMP/angle_dipole_omp.h b/src/USER-OMP/angle_dipole_omp.h
new file mode 100644
index 0000000000..ab7133da34
--- /dev/null
+++ b/src/USER-OMP/angle_dipole_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(dipole/omp,AngleDipoleOMP)
+
+#else
+
+#ifndef LMP_ANGLE_DIPOLE_OMP_H
+#define LMP_ANGLE_DIPOLE_OMP_H
+
+#include "angle_dipole.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class AngleDipoleOMP : public AngleDipole, public ThrOMP {
+
+ public:
+  AngleDipoleOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EFLAG>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/angle_harmonic_omp.cpp b/src/USER-OMP/angle_harmonic_omp.cpp
new file mode 100644
index 0000000000..c30f830b77
--- /dev/null
+++ b/src/USER-OMP/angle_harmonic_omp.cpp
@@ -0,0 +1,175 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_harmonic_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleHarmonicOMP::AngleHarmonicOMP(class LAMMPS *lmp)
+  : AngleHarmonic(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleHarmonicOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleHarmonicOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double delx1,dely1,delz1,delx2,dely2,delz2;
+  double eangle,f1[3],f3[3];
+  double dtheta,tk;
+  double rsq1,rsq2,r1,r2,c,s,a,a11,a12,a22;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // angle (cos and sin)
+
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    s = sqrt(1.0 - c*c);
+    if (s < SMALL) s = SMALL;
+    s = 1.0/s;
+
+    // force & energy
+
+    dtheta = acos(c) - theta0[type];
+    tk = k[type] * dtheta;
+
+    if (EFLAG) eangle = tk*dtheta;
+
+    a = -2.0 * tk * s;
+    a11 = a*c / rsq1;
+    a12 = -a / (r1*r2);
+    a22 = a*c / rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2;
+    f1[1] = a11*dely1 + a12*dely2;
+    f1[2] = a11*delz1 + a12*delz2;
+    f3[0] = a22*delx2 + a12*delx1;
+    f3[1] = a22*dely2 + a12*dely1;
+    f3[2] = a22*delz2 + a12*delz1;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
diff --git a/src/USER-OMP/angle_harmonic_omp.h b/src/USER-OMP/angle_harmonic_omp.h
new file mode 100644
index 0000000000..80c8f491aa
--- /dev/null
+++ b/src/USER-OMP/angle_harmonic_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(harmonic/omp,AngleHarmonicOMP)
+
+#else
+
+#ifndef LMP_ANGLE_HARMONIC_OMP_H
+#define LMP_ANGLE_HARMONIC_OMP_H
+
+#include "angle_harmonic.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class AngleHarmonicOMP : public AngleHarmonic, public ThrOMP {
+
+ public:
+  AngleHarmonicOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/angle_sdk_omp.cpp b/src/USER-OMP/angle_sdk_omp.cpp
new file mode 100644
index 0000000000..bd780e0fe0
--- /dev/null
+++ b/src/USER-OMP/angle_sdk_omp.cpp
@@ -0,0 +1,230 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_sdk_omp.h"
+#include "atom.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "comm.h"
+#include "force.h"
+#include "math_const.h"
+
+#include <math.h>
+
+#include "lj_sdk_common.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace LJSDKParms;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleSDKOMP::AngleSDKOMP(class LAMMPS *lmp)
+  : AngleSDK(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleSDKOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleSDKOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double delx1,dely1,delz1,delx2,dely2,delz2,delx3,dely3,delz3;
+  double eangle,f1[3],f3[3],e13,f13;
+  double dtheta,tk;
+  double rsq1,rsq2,rsq3,r1,r2,c,s,a,a11,a12,a22;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // angle (cos and sin)
+
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    s = sqrt(1.0 - c*c);
+    if (s < SMALL) s = SMALL;
+    s = 1.0/s;
+
+    // 1-3 LJ interaction.
+    // we only want to use the repulsive part,
+    // and it can be scaled (or off).
+    // so this has to be done here and not in the
+    // general non-bonded code.
+
+    f13 = e13 = delx3 = dely3 = delz3 = 0.0;
+
+    if (repflag) {
+
+      delx3 = x[i1][0] - x[i3][0];
+      dely3 = x[i1][1] - x[i3][1];
+      delz3 = x[i1][2] - x[i3][2];
+      rsq3 = delx3*delx3 + dely3*dely3 + delz3*delz3;
+
+      const int type1 = atom->type[i1];
+      const int type3 = atom->type[i3];
+
+      if (rsq3 < rminsq[type1][type3]) {
+        const int ljt = lj_type[type1][type3];
+        const double r2inv = 1.0/rsq3;
+
+        if (ljt == LJ12_4) {
+          const double r4inv=r2inv*r2inv;
+
+          f13 = r4inv*(lj1[type1][type3]*r4inv*r4inv - lj2[type1][type3]);
+          if (EFLAG) e13 = r4inv*(lj3[type1][type3]*r4inv*r4inv - lj4[type1][type3]);
+
+        } else if (ljt == LJ9_6) {
+          const double r3inv = r2inv*sqrt(r2inv);
+          const double r6inv = r3inv*r3inv;
+
+          f13 = r6inv*(lj1[type1][type3]*r3inv - lj2[type1][type3]);
+          if (EFLAG) e13 = r6inv*(lj3[type1][type3]*r3inv - lj4[type1][type3]);
+
+        } else if (ljt == LJ12_6) {
+          const double r6inv = r2inv*r2inv*r2inv;
+
+          f13 = r6inv*(lj1[type1][type3]*r6inv - lj2[type1][type3]);
+          if (EFLAG) e13 = r6inv*(lj3[type1][type3]*r6inv - lj4[type1][type3]);
+        }
+
+        // make sure energy is 0.0 at the cutoff.
+        if (EFLAG) e13 -= emin[type1][type3];
+
+        f13 *= r2inv;
+      }
+    }
+
+    // force & energy
+
+    dtheta = acos(c) - theta0[type];
+    tk = k[type] * dtheta;
+
+    if (EFLAG) eangle = tk*dtheta;
+
+    a = -2.0 * tk * s;
+    a11 = a*c / rsq1;
+    a12 = -a / (r1*r2);
+    a22 = a*c / rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2;
+    f1[1] = a11*dely1 + a12*dely2;
+    f1[2] = a11*delz1 + a12*delz2;
+    f3[0] = a22*delx2 + a12*delx1;
+    f3[1] = a22*dely2 + a12*dely1;
+    f3[2] = a22*delz2 + a12*delz1;
+
+    // apply force to each of the 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0] + f13*delx3;
+      f[i1][1] += f1[1] + f13*dely3;
+      f[i1][2] += f1[2] + f13*delz3;
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0] - f13*delx3;
+      f[i3][1] += f3[1] - f13*dely3;
+      f[i3][2] += f3[2] - f13*delz3;
+    }
+
+    if (EVFLAG) {
+      ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                   delx1,dely1,delz1,delx2,dely2,delz2,thr);
+      if (repflag) ev_tally13_thr(this,i1,i3,nlocal,NEWTON_BOND,
+                                  e13,f13,delx3,dely3,delz3,thr);
+    }
+  }
+}
diff --git a/src/USER-OMP/angle_sdk_omp.h b/src/USER-OMP/angle_sdk_omp.h
new file mode 100644
index 0000000000..9ab75904ce
--- /dev/null
+++ b/src/USER-OMP/angle_sdk_omp.h
@@ -0,0 +1,47 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(sdk/omp,AngleSDKOMP)
+AngleStyle(cg/cmm/omp,AngleSDKOMP)
+
+#else
+
+#ifndef LMP_ANGLE_SDK_OMP_H
+#define LMP_ANGLE_SDK_OMP_H
+
+#include "angle_sdk.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class AngleSDKOMP : public AngleSDK, public ThrOMP {
+
+ public:
+  AngleSDKOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/angle_table_omp.cpp b/src/USER-OMP/angle_table_omp.cpp
new file mode 100644
index 0000000000..15c32c8943
--- /dev/null
+++ b/src/USER-OMP/angle_table_omp.cpp
@@ -0,0 +1,175 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "angle_table_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include "math_const.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+AngleTableOMP::AngleTableOMP(class LAMMPS *lmp)
+  : AngleTable(lmp), ThrOMP(lmp,THR_ANGLE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleTableOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nanglelist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void AngleTableOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,n,type;
+  double eangle,f1[3],f3[3];
+  double delx1,dely1,delz1,delx2,dely2,delz2;
+  double rsq1,rsq2,r1,r2,c,s,a,a11,a12,a22;
+  double theta,u,mdu; //mdu: minus du, -du/dx=f
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const anglelist = neighbor->anglelist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = anglelist[n][0];
+    i2 = anglelist[n][1];
+    i3 = anglelist[n][2];
+    type = anglelist[n][3];
+
+    // 1st bond
+
+    delx1 = x[i1][0] - x[i2][0];
+    dely1 = x[i1][1] - x[i2][1];
+    delz1 = x[i1][2] - x[i2][2];
+
+    rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+    r1 = sqrt(rsq1);
+
+    // 2nd bond
+
+    delx2 = x[i3][0] - x[i2][0];
+    dely2 = x[i3][1] - x[i2][1];
+    delz2 = x[i3][2] - x[i2][2];
+
+    rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+    r2 = sqrt(rsq2);
+
+    // angle (cos and sin)
+
+    c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+    c /= r1*r2;
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    s = sqrt(1.0 - c*c);
+    if (s < SMALL) s = SMALL;
+    s = 1.0/s;
+
+    // tabulated force & energy
+
+    theta = acos(c);
+    uf_lookup(type,theta,u,mdu);
+
+    if (EFLAG) eangle = u;
+
+    a = mdu * s;
+    a11 = a*c / rsq1;
+    a12 = -a / (r1*r2);
+    a22 = a*c / rsq2;
+
+    f1[0] = a11*delx1 + a12*delx2;
+    f1[1] = a11*dely1 + a12*dely2;
+    f1[2] = a11*delz1 + a12*delz2;
+    f3[0] = a22*delx2 + a12*delx1;
+    f3[1] = a22*dely2 + a12*dely1;
+    f3[2] = a22*delz2 + a12*delz1;
+
+    // apply force to each of 3 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= f1[0] + f3[0];
+      f[i2][1] -= f1[1] + f3[1];
+      f[i2][2] -= f1[2] + f3[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,i3,nlocal,NEWTON_BOND,eangle,f1,f3,
+                             delx1,dely1,delz1,delx2,dely2,delz2,thr);
+  }
+}
diff --git a/src/USER-OMP/angle_table_omp.h b/src/USER-OMP/angle_table_omp.h
new file mode 100644
index 0000000000..97c9bb65a7
--- /dev/null
+++ b/src/USER-OMP/angle_table_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(table/omp,AngleTableOMP)
+
+#else
+
+#ifndef LMP_ANGLE_TABLE_OMP_H
+#define LMP_ANGLE_TABLE_OMP_H
+
+#include "angle_table.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class AngleTableOMP : public AngleTable, public ThrOMP {
+
+ public:
+  AngleTableOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/bond_class2_omp.cpp b/src/USER-OMP/bond_class2_omp.cpp
new file mode 100644
index 0000000000..580d9b9804
--- /dev/null
+++ b/src/USER-OMP/bond_class2_omp.cpp
@@ -0,0 +1,132 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "bond_class2_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+
+/* ---------------------------------------------------------------------- */
+
+BondClass2OMP::BondClass2OMP(class LAMMPS *lmp)
+  : BondClass2(lmp), ThrOMP(lmp,THR_BOND)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondClass2OMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nbondlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void BondClass2OMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,n,type;
+  double delx,dely,delz,ebond,fbond;
+  double rsq,r,dr,dr2,dr3,dr4,de_bond;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const bondlist = neighbor->bondlist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = bondlist[n][0];
+    i2 = bondlist[n][1];
+    type = bondlist[n][2];
+
+    delx = x[i1][0] - x[i2][0];
+    dely = x[i1][1] - x[i2][1];
+    delz = x[i1][2] - x[i2][2];
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    r = sqrt(rsq);
+    dr = r - r0[type];
+    dr2 = dr*dr;
+    dr3 = dr2*dr;
+    dr4 = dr3*dr;
+
+    // force & energy
+
+    de_bond = 2.0*k2[type]*dr + 3.0*k3[type]*dr2 + 4.0*k4[type]*dr3;
+    if (r > 0.0) fbond = -de_bond/r;
+    else fbond = 0.0;
+
+    if (EFLAG) ebond = k2[type]*dr2 + k3[type]*dr3 + k4[type]*dr4;
+
+    // apply force to each of 2 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += delx*fbond;
+      f[i1][1] += dely*fbond;
+      f[i1][2] += delz*fbond;
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= delx*fbond;
+      f[i2][1] -= dely*fbond;
+      f[i2][2] -= delz*fbond;
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,nlocal,NEWTON_BOND,
+                             ebond,fbond,delx,dely,delz,thr);
+  }
+}
diff --git a/src/USER-OMP/bond_class2_omp.h b/src/USER-OMP/bond_class2_omp.h
new file mode 100644
index 0000000000..845e28f592
--- /dev/null
+++ b/src/USER-OMP/bond_class2_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+
+BondStyle(class2/omp,BondClass2OMP)
+
+#else
+
+#ifndef LMP_BOND_CLASS2_OMP_H
+#define LMP_BOND_CLASS2_OMP_H
+
+#include "bond_class2.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class BondClass2OMP : public BondClass2, public ThrOMP {
+
+ public:
+  BondClass2OMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/bond_fene_expand_omp.cpp b/src/USER-OMP/bond_fene_expand_omp.cpp
new file mode 100644
index 0000000000..4706ef8232
--- /dev/null
+++ b/src/USER-OMP/bond_fene_expand_omp.cpp
@@ -0,0 +1,162 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "bond_fene_expand_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "error.h"
+#include "update.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+BondFENEExpandOMP::BondFENEExpandOMP(class LAMMPS *lmp)
+  : BondFENEExpand(lmp), ThrOMP(lmp,THR_BOND)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondFENEExpandOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nbondlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void BondFENEExpandOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,n,type;
+  double delx,dely,delz,ebond,fbond;
+  double rsq,r0sq,rlogarg,sr2,sr6;
+  double r,rshift,rshiftsq;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const bondlist = neighbor->bondlist;
+  const int nlocal = atom->nlocal;
+  const int tid = thr->get_tid();
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = bondlist[n][0];
+    i2 = bondlist[n][1];
+    type = bondlist[n][2];
+
+    delx = x[i1][0] - x[i2][0];
+    dely = x[i1][1] - x[i2][1];
+    delz = x[i1][2] - x[i2][2];
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    r = sqrt(rsq);
+    rshift = r - shift[type];
+    rshiftsq = rshift*rshift;
+    r0sq = r0[type] * r0[type];
+    rlogarg = 1.0 - rshiftsq/r0sq;
+
+    // if r -> r0, then rlogarg < 0.0 which is an error
+    // issue a warning and reset rlogarg = epsilon
+    // if r > 2*r0 something serious is wrong, abort
+
+    if (rlogarg < 0.1) {
+      char str[128];
+
+      sprintf(str,"FENE bond too long: " BIGINT_FORMAT " %d %d %g",
+              update->ntimestep,atom->tag[i1],atom->tag[i2],sqrt(rsq));
+      error->warning(FLERR,str,0);
+
+      if (check_error_thr((rlogarg <= -3.0),tid,FLERR,"Bad FENE bond"))
+        return;
+
+      rlogarg = 0.1;
+    }
+
+    fbond = -k[type]*rshift/rlogarg/r;
+
+    // force from LJ term
+
+    if (rshiftsq < TWO_1_3*sigma[type]*sigma[type]) {
+      sr2 = sigma[type]*sigma[type]/rshiftsq;
+      sr6 = sr2*sr2*sr2;
+      fbond += 48.0*epsilon[type]*sr6*(sr6-0.5)/rshift/r;
+    }
+
+    // energy
+
+    if (EFLAG) {
+      ebond = -0.5 * k[type]*r0sq*log(rlogarg);
+      if (rshiftsq < TWO_1_3*sigma[type]*sigma[type])
+        ebond += 4.0*epsilon[type]*sr6*(sr6-1.0) + epsilon[type];
+    }
+
+    // apply force to each of 2 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += delx*fbond;
+      f[i1][1] += dely*fbond;
+      f[i1][2] += delz*fbond;
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= delx*fbond;
+      f[i2][1] -= dely*fbond;
+      f[i2][2] -= delz*fbond;
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,nlocal,NEWTON_BOND,
+                             ebond,fbond,delx,dely,delz,thr);
+  }
+}
diff --git a/src/USER-OMP/bond_fene_expand_omp.h b/src/USER-OMP/bond_fene_expand_omp.h
new file mode 100644
index 0000000000..317b81e5de
--- /dev/null
+++ b/src/USER-OMP/bond_fene_expand_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+
+BondStyle(fene/expand/omp,BondFENEExpandOMP)
+
+#else
+
+#ifndef LMP_BOND_FENE_EXPAND_OMP_H
+#define LMP_BOND_FENE_EXPAND_OMP_H
+
+#include "bond_fene_expand.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class BondFENEExpandOMP : public BondFENEExpand, public ThrOMP {
+
+ public:
+  BondFENEExpandOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/bond_fene_omp.cpp b/src/USER-OMP/bond_fene_omp.cpp
new file mode 100644
index 0000000000..34f90ea3a0
--- /dev/null
+++ b/src/USER-OMP/bond_fene_omp.cpp
@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "bond_fene_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "error.h"
+#include "update.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+BondFENEOMP::BondFENEOMP(class LAMMPS *lmp)
+  : BondFENE(lmp), ThrOMP(lmp,THR_BOND)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondFENEOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nbondlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void BondFENEOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,n,type;
+  double delx,dely,delz,ebond,fbond;
+  double rsq,r0sq,rlogarg,sr2,sr6;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const bondlist = neighbor->bondlist;
+  const int nlocal = atom->nlocal;
+  const int tid = thr->get_tid();
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = bondlist[n][0];
+    i2 = bondlist[n][1];
+    type = bondlist[n][2];
+
+    delx = x[i1][0] - x[i2][0];
+    dely = x[i1][1] - x[i2][1];
+    delz = x[i1][2] - x[i2][2];
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    r0sq = r0[type] * r0[type];
+    rlogarg = 1.0 - rsq/r0sq;
+
+    // if r -> r0, then rlogarg < 0.0 which is an error
+    // issue a warning and reset rlogarg = epsilon
+    // if r > 2*r0 something serious is wrong, abort
+
+    if (rlogarg < 0.1) {
+      char str[128];
+
+      sprintf(str,"FENE bond too long: " BIGINT_FORMAT " %d %d %g",
+              update->ntimestep,atom->tag[i1],atom->tag[i2],sqrt(rsq));
+      error->warning(FLERR,str,0);
+
+      if (check_error_thr((rlogarg <= -3.0),tid,FLERR,"Bad FENE bond"))
+        return;
+
+      rlogarg = 0.1;
+    }
+
+    fbond = -k[type]/rlogarg;
+
+    // force from LJ term
+
+    if (rsq < TWO_1_3*sigma[type]*sigma[type]) {
+      sr2 = sigma[type]*sigma[type]/rsq;
+      sr6 = sr2*sr2*sr2;
+      fbond += 48.0*epsilon[type]*sr6*(sr6-0.5)/rsq;
+    }
+
+    // energy
+
+    if (EFLAG) {
+      ebond = -0.5 * k[type]*r0sq*log(rlogarg);
+      if (rsq < TWO_1_3*sigma[type]*sigma[type])
+        ebond += 4.0*epsilon[type]*sr6*(sr6-1.0) + epsilon[type];
+    }
+
+    // apply force to each of 2 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += delx*fbond;
+      f[i1][1] += dely*fbond;
+      f[i1][2] += delz*fbond;
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= delx*fbond;
+      f[i2][1] -= dely*fbond;
+      f[i2][2] -= delz*fbond;
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,nlocal,NEWTON_BOND,
+                             ebond,fbond,delx,dely,delz,thr);
+  }
+}
diff --git a/src/USER-OMP/bond_fene_omp.h b/src/USER-OMP/bond_fene_omp.h
new file mode 100644
index 0000000000..13e843fe13
--- /dev/null
+++ b/src/USER-OMP/bond_fene_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+
+BondStyle(fene/omp,BondFENEOMP)
+
+#else
+
+#ifndef LMP_BOND_FENE_OMP_H
+#define LMP_BOND_FENE_OMP_H
+
+#include "bond_fene.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class BondFENEOMP : public BondFENE, public ThrOMP {
+
+ public:
+  BondFENEOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/bond_harmonic_omp.cpp b/src/USER-OMP/bond_harmonic_omp.cpp
new file mode 100644
index 0000000000..dc8bab1c54
--- /dev/null
+++ b/src/USER-OMP/bond_harmonic_omp.cpp
@@ -0,0 +1,128 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "bond_harmonic_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+BondHarmonicOMP::BondHarmonicOMP(class LAMMPS *lmp)
+  : BondHarmonic(lmp), ThrOMP(lmp,THR_BOND)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondHarmonicOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nbondlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void BondHarmonicOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,n,type;
+  double delx,dely,delz,ebond,fbond;
+  double rsq,r,dr,rk;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const bondlist = neighbor->bondlist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = bondlist[n][0];
+    i2 = bondlist[n][1];
+    type = bondlist[n][2];
+
+    delx = x[i1][0] - x[i2][0];
+    dely = x[i1][1] - x[i2][1];
+    delz = x[i1][2] - x[i2][2];
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    r = sqrt(rsq);
+    dr = r - r0[type];
+    rk = k[type] * dr;
+
+    // force & energy
+
+    if (r > 0.0) fbond = -2.0*rk/r;
+    else fbond = 0.0;
+
+    if (EFLAG) ebond = rk*dr;
+
+    // apply force to each of 2 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += delx*fbond;
+      f[i1][1] += dely*fbond;
+      f[i1][2] += delz*fbond;
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= delx*fbond;
+      f[i2][1] -= dely*fbond;
+      f[i2][2] -= delz*fbond;
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,nlocal,NEWTON_BOND,
+                             ebond,fbond,delx,dely,delz,thr);
+  }
+}
diff --git a/src/USER-OMP/bond_harmonic_omp.h b/src/USER-OMP/bond_harmonic_omp.h
new file mode 100644
index 0000000000..0a81b52f41
--- /dev/null
+++ b/src/USER-OMP/bond_harmonic_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+
+BondStyle(harmonic/omp,BondHarmonicOMP)
+
+#else
+
+#ifndef LMP_BOND_HARMONIC_OMP_H
+#define LMP_BOND_HARMONIC_OMP_H
+
+#include "bond_harmonic.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class BondHarmonicOMP : public BondHarmonic, public ThrOMP {
+
+ public:
+  BondHarmonicOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/bond_harmonic_shift_cut_omp.cpp b/src/USER-OMP/bond_harmonic_shift_cut_omp.cpp
new file mode 100644
index 0000000000..97f7715bfb
--- /dev/null
+++ b/src/USER-OMP/bond_harmonic_shift_cut_omp.cpp
@@ -0,0 +1,132 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "bond_harmonic_shift_cut_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+BondHarmonicShiftCutOMP::BondHarmonicShiftCutOMP(class LAMMPS *lmp)
+  : BondHarmonicShiftCut(lmp), ThrOMP(lmp,THR_BOND)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondHarmonicShiftCutOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nbondlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void BondHarmonicShiftCutOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,n,type;
+  double delx,dely,delz,ebond,fbond;
+  double rsq,r,dr,rk;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const bondlist = neighbor->bondlist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = bondlist[n][0];
+    i2 = bondlist[n][1];
+    type = bondlist[n][2];
+
+    delx = x[i1][0] - x[i2][0];
+    dely = x[i1][1] - x[i2][1];
+    delz = x[i1][2] - x[i2][2];
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    r = sqrt(rsq);
+
+    if (r>r1[type]) continue;
+
+    dr = r - r0[type];
+    rk = k[type] * dr;
+
+    // force & energy
+
+    if (r > 0.0) fbond = -2.0*rk/r;
+    else fbond = 0.0;
+
+    if (EFLAG)
+      ebond = k[type]*(dr*dr -(r0[type]-r1[type])*(r0[type]-r1[type]) );
+
+    // apply force to each of 2 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += delx*fbond;
+      f[i1][1] += dely*fbond;
+      f[i1][2] += delz*fbond;
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= delx*fbond;
+      f[i2][1] -= dely*fbond;
+      f[i2][2] -= delz*fbond;
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,nlocal,NEWTON_BOND,
+                             ebond,fbond,delx,dely,delz,thr);
+  }
+}
diff --git a/src/USER-OMP/bond_harmonic_shift_cut_omp.h b/src/USER-OMP/bond_harmonic_shift_cut_omp.h
new file mode 100644
index 0000000000..f465ff5b3a
--- /dev/null
+++ b/src/USER-OMP/bond_harmonic_shift_cut_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+
+BondStyle(harmonic/shift/cut/omp,BondHarmonicShiftCutOMP)
+
+#else
+
+#ifndef LMP_BOND_HARMONIC_SHIFT_CUT_OMP_H
+#define LMP_BOND_HARMONIC_SHIFT_CUT_OMP_H
+
+#include "bond_harmonic_shift_cut.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class BondHarmonicShiftCutOMP : public BondHarmonicShiftCut, public ThrOMP {
+
+ public:
+  BondHarmonicShiftCutOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/bond_harmonic_shift_omp.cpp b/src/USER-OMP/bond_harmonic_shift_omp.cpp
new file mode 100644
index 0000000000..49d985b871
--- /dev/null
+++ b/src/USER-OMP/bond_harmonic_shift_omp.cpp
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "bond_harmonic_shift_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+BondHarmonicShiftOMP::BondHarmonicShiftOMP(class LAMMPS *lmp)
+  : BondHarmonicShift(lmp), ThrOMP(lmp,THR_BOND)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondHarmonicShiftOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nbondlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void BondHarmonicShiftOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,n,type;
+  double delx,dely,delz,ebond,fbond;
+  double rsq,r,dr,rk;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const bondlist = neighbor->bondlist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = bondlist[n][0];
+    i2 = bondlist[n][1];
+    type = bondlist[n][2];
+
+    delx = x[i1][0] - x[i2][0];
+    dely = x[i1][1] - x[i2][1];
+    delz = x[i1][2] - x[i2][2];
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    r = sqrt(rsq);
+    dr = r - r0[type];
+    rk = k[type] * dr;
+
+    // force & energy
+
+    if (r > 0.0) fbond = -2.0*rk/r;
+    else fbond = 0.0;
+
+    if (EFLAG)
+      ebond = k[type]*(dr*dr -(r0[type]-r1[type])*(r0[type]-r1[type]) );
+
+    // apply force to each of 2 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += delx*fbond;
+      f[i1][1] += dely*fbond;
+      f[i1][2] += delz*fbond;
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= delx*fbond;
+      f[i2][1] -= dely*fbond;
+      f[i2][2] -= delz*fbond;
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,nlocal,NEWTON_BOND,
+                             ebond,fbond,delx,dely,delz,thr);
+  }
+}
diff --git a/src/USER-OMP/bond_harmonic_shift_omp.h b/src/USER-OMP/bond_harmonic_shift_omp.h
new file mode 100644
index 0000000000..58ee6ce23d
--- /dev/null
+++ b/src/USER-OMP/bond_harmonic_shift_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+
+BondStyle(harmonic/shift/omp,BondHarmonicShiftOMP)
+
+#else
+
+#ifndef LMP_BOND_HARMONIC_SHIFT_OMP_H
+#define LMP_BOND_HARMONIC_SHIFT_OMP_H
+
+#include "bond_harmonic_shift.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class BondHarmonicShiftOMP : public BondHarmonicShift, public ThrOMP {
+
+ public:
+  BondHarmonicShiftOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/bond_morse_omp.cpp b/src/USER-OMP/bond_morse_omp.cpp
new file mode 100644
index 0000000000..21063c636a
--- /dev/null
+++ b/src/USER-OMP/bond_morse_omp.cpp
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "bond_morse_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+BondMorseOMP::BondMorseOMP(class LAMMPS *lmp)
+  : BondMorse(lmp), ThrOMP(lmp,THR_BOND)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondMorseOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nbondlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void BondMorseOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,n,type;
+  double delx,dely,delz,ebond,fbond;
+  double rsq,r,dr,ralpha;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const bondlist = neighbor->bondlist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = bondlist[n][0];
+    i2 = bondlist[n][1];
+    type = bondlist[n][2];
+
+    delx = x[i1][0] - x[i2][0];
+    dely = x[i1][1] - x[i2][1];
+    delz = x[i1][2] - x[i2][2];
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    r = sqrt(rsq);
+
+    dr = r - r0[type];
+    ralpha = exp(-alpha[type]*dr);
+
+    // force & energy
+
+    if (r > 0.0) fbond = -2.0*d0[type]*alpha[type]*(1-ralpha)*ralpha/r;
+    else fbond = 0.0;
+
+    if (EFLAG) ebond = d0[type]*(1-ralpha)*(1-ralpha);
+
+    // apply force to each of 2 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += delx*fbond;
+      f[i1][1] += dely*fbond;
+      f[i1][2] += delz*fbond;
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= delx*fbond;
+      f[i2][1] -= dely*fbond;
+      f[i2][2] -= delz*fbond;
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,nlocal,NEWTON_BOND,
+                             ebond,fbond,delx,dely,delz,thr);
+  }
+}
diff --git a/src/USER-OMP/bond_morse_omp.h b/src/USER-OMP/bond_morse_omp.h
new file mode 100644
index 0000000000..09d3a4458d
--- /dev/null
+++ b/src/USER-OMP/bond_morse_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+
+BondStyle(morse/omp,BondMorseOMP)
+
+#else
+
+#ifndef LMP_BOND_MORSE_OMP_H
+#define LMP_BOND_MORSE_OMP_H
+
+#include "bond_morse.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class BondMorseOMP : public BondMorse, public ThrOMP {
+
+ public:
+  BondMorseOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/bond_nonlinear_omp.cpp b/src/USER-OMP/bond_nonlinear_omp.cpp
new file mode 100644
index 0000000000..9e61ca3215
--- /dev/null
+++ b/src/USER-OMP/bond_nonlinear_omp.cpp
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "bond_nonlinear_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+BondNonlinearOMP::BondNonlinearOMP(class LAMMPS *lmp)
+  : BondNonlinear(lmp), ThrOMP(lmp,THR_BOND)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondNonlinearOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nbondlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void BondNonlinearOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,n,type;
+  double delx,dely,delz,ebond,fbond;
+  double rsq,r,dr,drsq,lamdasq,denom,denomsq;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const bondlist = neighbor->bondlist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = bondlist[n][0];
+    i2 = bondlist[n][1];
+    type = bondlist[n][2];
+
+    delx = x[i1][0] - x[i2][0];
+    dely = x[i1][1] - x[i2][1];
+    delz = x[i1][2] - x[i2][2];
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    r = sqrt(rsq);
+    dr = r - r0[type];
+    drsq = dr*dr;
+    lamdasq = lamda[type]*lamda[type];
+    denom = lamdasq - drsq;
+    denomsq = denom*denom;
+
+    // force & energy
+
+    fbond = -epsilon[type]/r * 2.0*dr*lamdasq/denomsq;
+    if (EFLAG) ebond = epsilon[type] * drsq / denom;
+
+    // apply force to each of 2 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += delx*fbond;
+      f[i1][1] += dely*fbond;
+      f[i1][2] += delz*fbond;
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= delx*fbond;
+      f[i2][1] -= dely*fbond;
+      f[i2][2] -= delz*fbond;
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,nlocal,NEWTON_BOND,
+                             ebond,fbond,delx,dely,delz,thr);
+  }
+}
diff --git a/src/USER-OMP/bond_nonlinear_omp.h b/src/USER-OMP/bond_nonlinear_omp.h
new file mode 100644
index 0000000000..fcb0fa325d
--- /dev/null
+++ b/src/USER-OMP/bond_nonlinear_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+
+BondStyle(nonlinear/omp,BondNonlinearOMP)
+
+#else
+
+#ifndef LMP_BOND_NONLINEAR_OMP_H
+#define LMP_BOND_NONLINEAR_OMP_H
+
+#include "bond_nonlinear.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class BondNonlinearOMP : public BondNonlinear, public ThrOMP {
+
+ public:
+  BondNonlinearOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/bond_quartic_omp.cpp b/src/USER-OMP/bond_quartic_omp.cpp
new file mode 100644
index 0000000000..79d7621a61
--- /dev/null
+++ b/src/USER-OMP/bond_quartic_omp.cpp
@@ -0,0 +1,197 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "bond_quartic_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "pair.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+BondQuarticOMP::BondQuarticOMP(class LAMMPS *lmp)
+  : BondQuartic(lmp), ThrOMP(lmp,THR_BOND)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondQuarticOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+ // insure pair->ev_tally() will use 1-4 virial contribution
+
+  if (vflag_global == 2)
+    force->pair->vflag_either = force->pair->vflag_global = 1;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nbondlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void BondQuarticOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,n,m,type,itype,jtype;
+  double delx,dely,delz,ebond,fbond,evdwl,fpair;
+  double r,rsq,dr,r2,ra,rb,sr2,sr6;
+
+  ebond = evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  int * const * const bondlist = neighbor->bondlist;
+  const double * const * const cutsq = force->pair->cutsq;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+
+    // skip bond if already broken
+
+    if (bondlist[n][2] <= 0) continue;
+
+    i1 = bondlist[n][0];
+    i2 = bondlist[n][1];
+    type = bondlist[n][2];
+
+    delx = x[i1][0] - x[i2][0];
+    dely = x[i1][1] - x[i2][1];
+    delz = x[i1][2] - x[i2][2];
+
+    rsq = delx*delx + dely*dely + delz*delz;
+
+    // if bond breaks, set type to 0
+    //   both in temporary bondlist and permanent bond_type
+    // if this proc owns both atoms,
+    //   negate bond_type twice if other atom stores it
+    // if other proc owns 2nd atom, other proc will also break bond
+
+    if (rsq > rc[type]*rc[type]) {
+      bondlist[n][2] = 0;
+      for (m = 0; m < atom->num_bond[i1]; m++)
+        if (atom->bond_atom[i1][m] == atom->tag[i2])
+          atom->bond_type[i1][m] = 0;
+      if (i2 < atom->nlocal)
+        for (m = 0; m < atom->num_bond[i2]; m++)
+          if (atom->bond_atom[i2][m] == atom->tag[i1])
+            atom->bond_type[i2][m] = 0;
+      continue;
+    }
+
+    // quartic bond
+    // 1st portion is from quartic term
+    // 2nd portion is from LJ term cut at 2^(1/6) with eps = sigma = 1.0
+
+    r = sqrt(rsq);
+    dr = r - rc[type];
+    r2 = dr*dr;
+    ra = dr - b1[type];
+    rb = dr - b2[type];
+    fbond = -k[type]/r * (r2*(ra+rb) + 2.0*dr*ra*rb);
+
+    if (rsq < TWO_1_3) {
+      sr2 = 1.0/rsq;
+      sr6 = sr2*sr2*sr2;
+      fbond += 48.0*sr6*(sr6-0.5)/rsq;
+    }
+
+    if (EFLAG) {
+      ebond = k[type]*r2*ra*rb + u0[type];
+      if (rsq < TWO_1_3) ebond += 4.0*sr6*(sr6-1.0) + 1.0;
+    }
+
+    // apply force to each of 2 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += delx*fbond;
+      f[i1][1] += dely*fbond;
+      f[i1][2] += delz*fbond;
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= delx*fbond;
+      f[i2][1] -= dely*fbond;
+      f[i2][2] -= delz*fbond;
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,nlocal,NEWTON_BOND,ebond,fbond,delx,dely,delz,thr);
+
+    // subtract out pairwise contribution from 2 atoms via pair->single()
+    // required since special_bond = 1,1,1
+    // tally energy/virial in pair, using newton_bond as newton flag
+
+    itype = atom->type[i1];
+    jtype = atom->type[i2];
+
+    if (rsq < cutsq[itype][jtype]) {
+      evdwl = -force->pair->single(i1,i2,itype,jtype,rsq,1.0,1.0,fpair);
+      fpair = -fpair;
+
+      if (NEWTON_BOND || i1 < nlocal) {
+        f[i1][0] += delx*fpair;
+        f[i1][1] += dely*fpair;
+        f[i1][2] += delz*fpair;
+      }
+      if (NEWTON_BOND || i2 < nlocal) {
+        f[i2][0] -= delx*fpair;
+        f[i2][1] -= dely*fpair;
+        f[i2][2] -= delz*fpair;
+      }
+
+      if (EVFLAG) ev_tally_thr(force->pair,i1,i2,nlocal,NEWTON_BOND,
+                               evdwl,0.0,fpair,delx,dely,delz,thr);
+    }
+  }
+}
diff --git a/src/USER-OMP/bond_quartic_omp.h b/src/USER-OMP/bond_quartic_omp.h
new file mode 100644
index 0000000000..3a249a6cac
--- /dev/null
+++ b/src/USER-OMP/bond_quartic_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+
+BondStyle(quartic/omp,BondQuarticOMP)
+
+#else
+
+#ifndef LMP_BOND_QUARTIC_OMP_H
+#define LMP_BOND_QUARTIC_OMP_H
+
+#include "bond_quartic.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class BondQuarticOMP : public BondQuartic, public ThrOMP {
+
+ public:
+  BondQuarticOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/bond_table_omp.cpp b/src/USER-OMP/bond_table_omp.cpp
new file mode 100644
index 0000000000..3184991315
--- /dev/null
+++ b/src/USER-OMP/bond_table_omp.cpp
@@ -0,0 +1,126 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "bond_table_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "domain.h"
+
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+BondTableOMP::BondTableOMP(class LAMMPS *lmp)
+  : BondTable(lmp), ThrOMP(lmp,THR_BOND)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondTableOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nbondlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void BondTableOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,n,type;
+  double delx,dely,delz,ebond,fbond;
+  double rsq,r;
+  double u,mdu;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const bondlist = neighbor->bondlist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = bondlist[n][0];
+    i2 = bondlist[n][1];
+    type = bondlist[n][2];
+
+    delx = x[i1][0] - x[i2][0];
+    dely = x[i1][1] - x[i2][1];
+    delz = x[i1][2] - x[i2][2];
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    r = sqrt(rsq);
+
+    // force & energy
+
+    uf_lookup(type,r,u,mdu);
+    fbond = mdu/r;
+    ebond = u;
+
+    // apply force to each of 2 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += delx*fbond;
+      f[i1][1] += dely*fbond;
+      f[i1][2] += delz*fbond;
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] -= delx*fbond;
+      f[i2][1] -= dely*fbond;
+      f[i2][2] -= delz*fbond;
+    }
+
+    if (EVFLAG) ev_tally_thr(this,i1,i2,nlocal,NEWTON_BOND,
+                             ebond,fbond,delx,dely,delz,thr);
+  }
+}
diff --git a/src/USER-OMP/bond_table_omp.h b/src/USER-OMP/bond_table_omp.h
new file mode 100644
index 0000000000..37e057ee26
--- /dev/null
+++ b/src/USER-OMP/bond_table_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+
+BondStyle(table/omp,BondTableOMP)
+
+#else
+
+#ifndef LMP_BOND_TABLE_OMP_H
+#define LMP_BOND_TABLE_OMP_H
+
+#include "bond_table.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class BondTableOMP : public BondTable, public ThrOMP {
+
+ public:
+  BondTableOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/dihedral_charmm_omp.cpp b/src/USER-OMP/dihedral_charmm_omp.cpp
new file mode 100644
index 0000000000..b069af43c5
--- /dev/null
+++ b/src/USER-OMP/dihedral_charmm_omp.cpp
@@ -0,0 +1,322 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "lmptype.h"
+#include "mpi.h"
+#include "math.h"
+#include "dihedral_charmm_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "pair.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+DihedralCharmmOMP::DihedralCharmmOMP(class LAMMPS *lmp)
+  : DihedralCharmm(lmp), ThrOMP(lmp,THR_DIHEDRAL|THR_CHARMM)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralCharmmOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  // insure pair->ev_tally() will use 1-4 virial contribution
+
+  if (weightflag && vflag_global == 2)
+    force->pair->vflag_either = force->pair->vflag_global = 1;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->ndihedrallist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void DihedralCharmmOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+
+  int i1,i2,i3,i4,i,m,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double edihedral,f1[3],f2[3],f3[3],f4[3];
+  double ax,ay,az,bx,by,bz,rasq,rbsq,rgsq,rg,rginv,ra2inv,rb2inv,rabinv;
+  double df,df1,ddf1,fg,hg,fga,hgb,gaa,gbb;
+  double dtfx,dtfy,dtfz,dtgx,dtgy,dtgz,dthx,dthy,dthz;
+  double c,s,p,sx2,sy2,sz2;
+  int itype,jtype;
+  double delx,dely,delz,rsq,r2inv,r6inv;
+  double forcecoul,forcelj,fpair,ecoul,evdwl;
+
+  edihedral = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const atomtype = atom->type;
+  const int * const * const dihedrallist = neighbor->dihedrallist;
+  const double qqrd2e = force->qqrd2e;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = dihedrallist[n][0];
+    i2 = dihedrallist[n][1];
+    i3 = dihedrallist[n][2];
+    i4 = dihedrallist[n][3];
+    type = dihedrallist[n][4];
+
+    // 1st bond
+
+    vb1x = x[i1][0] - x[i2][0];
+    vb1y = x[i1][1] - x[i2][1];
+    vb1z = x[i1][2] - x[i2][2];
+
+    // 2nd bond
+
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    vb2xm = -vb2x;
+    vb2ym = -vb2y;
+    vb2zm = -vb2z;
+
+    // 3rd bond
+
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+
+    // c,s calculation
+
+    ax = vb1y*vb2zm - vb1z*vb2ym;
+    ay = vb1z*vb2xm - vb1x*vb2zm;
+    az = vb1x*vb2ym - vb1y*vb2xm;
+    bx = vb3y*vb2zm - vb3z*vb2ym;
+    by = vb3z*vb2xm - vb3x*vb2zm;
+    bz = vb3x*vb2ym - vb3y*vb2xm;
+
+    rasq = ax*ax + ay*ay + az*az;
+    rbsq = bx*bx + by*by + bz*bz;
+    rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
+    rg = sqrt(rgsq);
+
+    rginv = ra2inv = rb2inv = 0.0;
+    if (rg > 0) rginv = 1.0/rg;
+    if (rasq > 0) ra2inv = 1.0/rasq;
+    if (rbsq > 0) rb2inv = 1.0/rbsq;
+    rabinv = sqrt(ra2inv*rb2inv);
+
+    c = (ax*bx + ay*by + az*bz)*rabinv;
+    s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
+
+    // error check
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    m = multiplicity[type];
+    p = 1.0;
+    ddf1 = df1 = 0.0;
+
+    for (i = 0; i < m; i++) {
+      ddf1 = p*c - df1*s;
+      df1 = p*s + df1*c;
+      p = ddf1;
+    }
+
+    p = p*cos_shift[type] + df1*sin_shift[type];
+    df1 = df1*cos_shift[type] - ddf1*sin_shift[type];
+    df1 *= -m;
+    p += 1.0;
+
+    if (m == 0) {
+      p = 1.0 + cos_shift[type];
+      df1 = 0.0;
+    }
+
+    if (EFLAG) edihedral = k[type] * p;
+
+    fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
+    hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
+    fga = fg*ra2inv*rginv;
+    hgb = hg*rb2inv*rginv;
+    gaa = -ra2inv*rg;
+    gbb = rb2inv*rg;
+
+    dtfx = gaa*ax;
+    dtfy = gaa*ay;
+    dtfz = gaa*az;
+    dtgx = fga*ax - hgb*bx;
+    dtgy = fga*ay - hgb*by;
+    dtgz = fga*az - hgb*bz;
+    dthx = gbb*bx;
+    dthy = gbb*by;
+    dthz = gbb*bz;
+
+    df = -k[type] * df1;
+
+    sx2 = df*dtgx;
+    sy2 = df*dtgy;
+    sz2 = df*dtgz;
+
+    f1[0] = df*dtfx;
+    f1[1] = df*dtfy;
+    f1[2] = df*dtfz;
+
+    f2[0] = sx2 - f1[0];
+    f2[1] = sy2 - f1[1];
+    f2[2] = sz2 - f1[2];
+
+    f4[0] = df*dthx;
+    f4[1] = df*dthy;
+    f4[2] = df*dthz;
+
+    f3[0] = -sx2 - f4[0];
+    f3[1] = -sy2 - f4[1];
+    f3[2] = -sz2 - f4[2];
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f2[0];
+      f[i2][1] += f2[1];
+      f[i2][2] += f2[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0];
+      f[i4][1] += f4[1];
+      f[i4][2] += f4[2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,edihedral,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+    // 1-4 LJ and Coulomb interactions
+    // tally energy/virial in pair, using newton_bond as newton flag
+
+    if (weight[type] > 0.0) {
+      itype = atomtype[i1];
+      jtype = atomtype[i4];
+
+      delx = x[i1][0] - x[i4][0];
+      dely = x[i1][1] - x[i4][1];
+      delz = x[i1][2] - x[i4][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      r2inv = 1.0/rsq;
+      r6inv = r2inv*r2inv*r2inv;
+
+      if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv;
+      else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv);
+      forcelj = r6inv * (lj14_1[itype][jtype]*r6inv - lj14_2[itype][jtype]);
+      fpair = weight[type] * (forcelj+forcecoul)*r2inv;
+
+      if (EFLAG) {
+        ecoul = weight[type] * forcecoul;
+        evdwl = r6inv * (lj14_3[itype][jtype]*r6inv - lj14_4[itype][jtype]);
+        evdwl *= weight[type];
+      }
+
+      if (NEWTON_BOND || i1 < nlocal) {
+        f[i1][0] += delx*fpair;
+        f[i1][1] += dely*fpair;
+        f[i1][2] += delz*fpair;
+      }
+      if (NEWTON_BOND || i4 < nlocal) {
+        f[i4][0] -= delx*fpair;
+        f[i4][1] -= dely*fpair;
+        f[i4][2] -= delz*fpair;
+      }
+
+      if (EVFLAG) ev_tally_thr(force->pair,i1,i4,nlocal,NEWTON_BOND,
+                               evdwl,ecoul,fpair,delx,dely,delz,thr);
+    }
+  }
+}
diff --git a/src/USER-OMP/dihedral_charmm_omp.h b/src/USER-OMP/dihedral_charmm_omp.h
new file mode 100644
index 0000000000..cd0ae5f8ea
--- /dev/null
+++ b/src/USER-OMP/dihedral_charmm_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(charmm/omp,DihedralCharmmOMP)
+
+#else
+
+#ifndef LMP_DIHEDRAL_CHARMM_OMP_H
+#define LMP_DIHEDRAL_CHARMM_OMP_H
+
+#include "dihedral_charmm.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class DihedralCharmmOMP : public DihedralCharmm, public ThrOMP {
+
+ public:
+  DihedralCharmmOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/dihedral_class2_omp.cpp b/src/USER-OMP/dihedral_class2_omp.cpp
new file mode 100644
index 0000000000..d21c2869e9
--- /dev/null
+++ b/src/USER-OMP/dihedral_class2_omp.cpp
@@ -0,0 +1,531 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "dihedral_class2_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.0000001
+
+/* ---------------------------------------------------------------------- */
+
+DihedralClass2OMP::DihedralClass2OMP(class LAMMPS *lmp)
+  : DihedralClass2(lmp), ThrOMP(lmp,THR_DIHEDRAL)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralClass2OMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->ndihedrallist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void DihedralClass2OMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+
+  int i1,i2,i3,i4,i,j,k,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double edihedral;
+  double r1mag2,r1,r2mag2,r2,r3mag2,r3;
+  double sb1,rb1,sb2,rb2,sb3,rb3,c0,r12c1;
+  double r12c2,costh12,costh13,costh23,sc1,sc2,s1,s2,c;
+  double cosphi,phi,sinphi,a11,a22,a33,a12,a13,a23,sx1,sx2;
+  double sx12,sy1,sy2,sy12,sz1,sz2,sz12,dphi1,dphi2,dphi3;
+  double de_dihedral,t1,t2,t3,t4,cos2phi,cos3phi,bt1,bt2;
+  double bt3,sumbte,db,sumbtf,at1,at2,at3,da,da1,da2,r1_0;
+  double r3_0,dr1,dr2,tk1,tk2,s12,sin2;
+  double dcosphidr[4][3],dphidr[4][3],dbonddr[3][4][3],dthetadr[2][4][3];
+  double fabcd[4][3];
+
+  edihedral = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const dihedrallist = neighbor->dihedrallist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = dihedrallist[n][0];
+    i2 = dihedrallist[n][1];
+    i3 = dihedrallist[n][2];
+    i4 = dihedrallist[n][3];
+    type = dihedrallist[n][4];
+
+    // 1st bond
+
+    vb1x = x[i1][0] - x[i2][0];
+    vb1y = x[i1][1] - x[i2][1];
+    vb1z = x[i1][2] - x[i2][2];
+
+    // 2nd bond
+
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    vb2xm = -vb2x;
+    vb2ym = -vb2y;
+    vb2zm = -vb2z;
+
+    // 3rd bond
+
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+
+    // distances
+
+    r1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
+    r1 = sqrt(r1mag2);
+    r2mag2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
+    r2 = sqrt(r2mag2);
+    r3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
+    r3 = sqrt(r3mag2);
+
+    sb1 = 1.0/r1mag2;
+    rb1 = 1.0/r1;
+    sb2 = 1.0/r2mag2;
+    rb2 = 1.0/r2;
+    sb3 = 1.0/r3mag2;
+    rb3 = 1.0/r3;
+
+    c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
+
+    // angles
+
+    r12c1 = rb1*rb2;
+    r12c2 = rb2*rb3;
+    costh12 = (vb1x*vb2x + vb1y*vb2y + vb1z*vb2z) * r12c1;
+    costh13 = c0;
+    costh23 = (vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z) * r12c2;
+
+    // cos and sin of 2 angles and final c
+
+    sin2 = MAX(1.0 - costh12*costh12,0.0);
+    sc1 = sqrt(sin2);
+    if (sc1 < SMALL) sc1 = SMALL;
+    sc1 = 1.0/sc1;
+
+    sin2 = MAX(1.0 - costh23*costh23,0.0);
+    sc2 = sqrt(sin2);
+    if (sc2 < SMALL) sc2 = SMALL;
+    sc2 = 1.0/sc2;
+
+    s1 = sc1 * sc1;
+    s2 = sc2 * sc2;
+    s12 = sc1 * sc2;
+    c = (c0 + costh12*costh23) * s12;
+
+    // error check
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+    cosphi = c;
+    phi = acos(c);
+
+    sinphi = sqrt(1.0 - c*c);
+    sinphi = MAX(sinphi,SMALL);
+
+    a11 = -c*sb1*s1;
+    a22 = sb2 * (2.0*costh13*s12 - c*(s1+s2));
+    a33 = -c*sb3*s2;
+    a12 = r12c1 * (costh12*c*s1 + costh23*s12);
+    a13 = rb1*rb3*s12;
+    a23 = r12c2 * (-costh23*c*s2 - costh12*s12);
+
+    sx1  = a11*vb1x + a12*vb2x + a13*vb3x;
+    sx2  = a12*vb1x + a22*vb2x + a23*vb3x;
+    sx12 = a13*vb1x + a23*vb2x + a33*vb3x;
+    sy1  = a11*vb1y + a12*vb2y + a13*vb3y;
+    sy2  = a12*vb1y + a22*vb2y + a23*vb3y;
+    sy12 = a13*vb1y + a23*vb2y + a33*vb3y;
+    sz1  = a11*vb1z + a12*vb2z + a13*vb3z;
+    sz2  = a12*vb1z + a22*vb2z + a23*vb3z;
+    sz12 = a13*vb1z + a23*vb2z + a33*vb3z;
+
+    // set up d(cos(phi))/d(r) and dphi/dr arrays
+
+    dcosphidr[0][0] = -sx1;
+    dcosphidr[0][1] = -sy1;
+    dcosphidr[0][2] = -sz1;
+    dcosphidr[1][0] = sx2 + sx1;
+    dcosphidr[1][1] = sy2 + sy1;
+    dcosphidr[1][2] = sz2 + sz1;
+    dcosphidr[2][0] = sx12 - sx2;
+    dcosphidr[2][1] = sy12 - sy2;
+    dcosphidr[2][2] = sz12 - sz2;
+    dcosphidr[3][0] = -sx12;
+    dcosphidr[3][1] = -sy12;
+    dcosphidr[3][2] = -sz12;
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++)
+        dphidr[i][j] = -dcosphidr[i][j] / sinphi;
+
+    // energy
+
+    dphi1 = phi - phi1[type];
+    dphi2 = 2.0*phi - phi2[type];
+    dphi3 = 3.0*phi - phi3[type];
+
+    if (EFLAG) edihedral = k1[type]*(1.0 - cos(dphi1)) +
+                 k2[type]*(1.0 - cos(dphi2)) +
+                 k3[type]*(1.0 - cos(dphi3));
+
+    de_dihedral = k1[type]*sin(dphi1) + 2.0*k2[type]*sin(dphi2) +
+      3.0*k3[type]*sin(dphi3);
+
+    // torsion forces on all 4 atoms
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++)
+        fabcd[i][j] = de_dihedral*dphidr[i][j];
+
+    // set up d(bond)/d(r) array
+    // dbonddr(i,j,k) = bond i, atom j, coordinate k
+
+    for (i = 0; i < 3; i++)
+      for (j = 0; j < 4; j++)
+        for (k = 0; k < 3; k++)
+          dbonddr[i][j][k] = 0.0;
+
+    // bond1
+
+    dbonddr[0][0][0] = vb1x / r1;
+    dbonddr[0][0][1] = vb1y / r1;
+    dbonddr[0][0][2] = vb1z / r1;
+    dbonddr[0][1][0] = -vb1x / r1;
+    dbonddr[0][1][1] = -vb1y / r1;
+    dbonddr[0][1][2] = -vb1z / r1;
+
+    // bond2
+
+    dbonddr[1][1][0] = vb2x / r2;
+    dbonddr[1][1][1] = vb2y / r2;
+    dbonddr[1][1][2] = vb2z / r2;
+    dbonddr[1][2][0] = -vb2x / r2;
+    dbonddr[1][2][1] = -vb2y / r2;
+    dbonddr[1][2][2] = -vb2z / r2;
+
+    // bond3
+
+    dbonddr[2][2][0] = vb3x / r3;
+    dbonddr[2][2][1] = vb3y / r3;
+    dbonddr[2][2][2] = vb3z / r3;
+    dbonddr[2][3][0] = -vb3x / r3;
+    dbonddr[2][3][1] = -vb3y / r3;
+    dbonddr[2][3][2] = -vb3z / r3;
+
+    // set up d(theta)/d(r) array
+    // dthetadr(i,j,k) = angle i, atom j, coordinate k
+
+    for (i = 0; i < 2; i++)
+      for (j = 0; j < 4; j++)
+        for (k = 0; k < 3; k++)
+          dthetadr[i][j][k] = 0.0;
+
+    t1 = costh12 / r1mag2;
+    t2 = costh23 / r2mag2;
+    t3 = costh12 / r2mag2;
+    t4 = costh23 / r3mag2;
+
+    // angle12
+
+    dthetadr[0][0][0] = sc1 * ((t1 * vb1x) - (vb2x * r12c1));
+    dthetadr[0][0][1] = sc1 * ((t1 * vb1y) - (vb2y * r12c1));
+    dthetadr[0][0][2] = sc1 * ((t1 * vb1z) - (vb2z * r12c1));
+
+    dthetadr[0][1][0] = sc1 * ((-t1 * vb1x) + (vb2x * r12c1) +
+                               (-t3 * vb2x) + (vb1x * r12c1));
+    dthetadr[0][1][1] = sc1 * ((-t1 * vb1y) + (vb2y * r12c1) +
+                               (-t3 * vb2y) + (vb1y * r12c1));
+    dthetadr[0][1][2] = sc1 * ((-t1 * vb1z) + (vb2z * r12c1) +
+                               (-t3 * vb2z) + (vb1z * r12c1));
+
+    dthetadr[0][2][0] = sc1 * ((t3 * vb2x) - (vb1x * r12c1));
+    dthetadr[0][2][1] = sc1 * ((t3 * vb2y) - (vb1y * r12c1));
+    dthetadr[0][2][2] = sc1 * ((t3 * vb2z) - (vb1z * r12c1));
+
+    // angle23
+
+    dthetadr[1][1][0] = sc2 * ((t2 * vb2x) + (vb3x * r12c2));
+    dthetadr[1][1][1] = sc2 * ((t2 * vb2y) + (vb3y * r12c2));
+    dthetadr[1][1][2] = sc2 * ((t2 * vb2z) + (vb3z * r12c2));
+
+    dthetadr[1][2][0] = sc2 * ((-t2 * vb2x) - (vb3x * r12c2) +
+                               (t4 * vb3x) + (vb2x * r12c2));
+    dthetadr[1][2][1] = sc2 * ((-t2 * vb2y) - (vb3y * r12c2) +
+                               (t4 * vb3y) + (vb2y * r12c2));
+    dthetadr[1][2][2] = sc2 * ((-t2 * vb2z) - (vb3z * r12c2) +
+                               (t4 * vb3z) + (vb2z * r12c2));
+
+    dthetadr[1][3][0] = -sc2 * ((t4 * vb3x) + (vb2x * r12c2));
+    dthetadr[1][3][1] = -sc2 * ((t4 * vb3y) + (vb2y * r12c2));
+    dthetadr[1][3][2] = -sc2 * ((t4 * vb3z) + (vb2z * r12c2));
+
+    // mid-bond/torsion coupling
+    // energy on bond2 (middle bond)
+
+    cos2phi = cos(2.0*phi);
+    cos3phi = cos(3.0*phi);
+
+    bt1 = mbt_f1[type] * cosphi;
+    bt2 = mbt_f2[type] * cos2phi;
+    bt3 = mbt_f3[type] * cos3phi;
+    sumbte = bt1 + bt2 + bt3;
+    db = r2 - mbt_r0[type];
+    if (EFLAG) edihedral += db * sumbte;
+
+    // force on bond2
+
+    bt1 = -mbt_f1[type] * sinphi;
+    bt2 = -2.0 * mbt_f2[type] * sin(2.0*phi);
+    bt3 = -3.0 * mbt_f3[type] * sin(3.0*phi);
+    sumbtf = bt1 + bt2 + bt3;
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++)
+        fabcd[i][j] += db*sumbtf*dphidr[i][j] + sumbte*dbonddr[1][i][j];
+
+    // end-bond/torsion coupling
+    // energy on bond1 (first bond)
+
+    bt1 = ebt_f1_1[type] * cosphi;
+    bt2 = ebt_f2_1[type] * cos2phi;
+    bt3 = ebt_f3_1[type] * cos3phi;
+    sumbte = bt1 + bt2 + bt3;
+
+    db = r1 - ebt_r0_1[type];
+    if (EFLAG) edihedral += db * (bt1+bt2+bt3);
+
+    // force on bond1
+
+    bt1 = ebt_f1_1[type] * sinphi;
+    bt2 = 2.0 * ebt_f2_1[type] * sin(2.0*phi);
+    bt3 = 3.0 * ebt_f3_1[type] * sin(3.0*phi);
+    sumbtf = bt1 + bt2 + bt3;
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++)
+        fabcd[i][j] -= db*sumbtf*dphidr[i][j] + sumbte*dbonddr[0][i][j];
+
+    // end-bond/torsion coupling
+    // energy on bond3 (last bond)
+
+    bt1 = ebt_f1_2[type] * cosphi;
+    bt2 = ebt_f2_2[type] * cos2phi;
+    bt3 = ebt_f3_2[type] * cos3phi;
+    sumbte = bt1 + bt2 + bt3;
+
+    db = r3 - ebt_r0_2[type];
+    if (EFLAG) edihedral += db * (bt1+bt2+bt3);
+
+    // force on bond3
+
+    bt1 = -ebt_f1_2[type] * sinphi;
+    bt2 = -2.0 * ebt_f2_2[type] * sin(2.0*phi);
+    bt3 = -3.0 * ebt_f3_2[type] * sin(3.0*phi);
+    sumbtf = bt1 + bt2 + bt3;
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++)
+        fabcd[i][j] += db*sumbtf*dphidr[i][j] + sumbte*dbonddr[2][i][j];
+
+    // angle/torsion coupling
+    // energy on angle1
+
+    at1 = at_f1_1[type] * cosphi;
+    at2 = at_f2_1[type] * cos2phi;
+    at3 = at_f3_1[type] * cos3phi;
+    sumbte = at1 + at2 + at3;
+
+    da = acos(costh12) - at_theta0_1[type];
+    if (EFLAG) edihedral += da * (at1+at2+at3);
+
+    // force on angle1
+
+    bt1 = at_f1_1[type] * sinphi;
+    bt2 = 2.0 * at_f2_1[type] * sin(2.0*phi);
+    bt3 = 3.0 * at_f3_1[type] * sin(3.0*phi);
+    sumbtf = bt1 + bt2 + bt3;
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++)
+        fabcd[i][j] -= da*sumbtf*dphidr[i][j] + sumbte*dthetadr[0][i][j];
+
+    // energy on angle2
+
+    at1 = at_f1_2[type] * cosphi;
+    at2 = at_f2_2[type] * cos2phi;
+    at3 = at_f3_2[type] * cos3phi;
+    sumbte = at1 + at2 + at3;
+
+    da = acos(costh23) - at_theta0_2[type];
+    if (EFLAG) edihedral += da * (at1+at2+at3);
+
+    // force on angle2
+
+    bt1 = -at_f1_2[type] * sinphi;
+    bt2 = -2.0 * at_f2_2[type] * sin(2.0*phi);
+    bt3 = -3.0 * at_f3_2[type] * sin(3.0*phi);
+    sumbtf = bt1 + bt2 + bt3;
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++)
+        fabcd[i][j] += da*sumbtf*dphidr[i][j] + sumbte*dthetadr[1][i][j];
+
+    // angle/angle/torsion coupling
+
+    da1 = acos(costh12) - aat_theta0_1[type];
+    da2 = acos(costh23) - aat_theta0_2[type];
+
+    if (EFLAG) edihedral += aat_k[type]*da1*da2*cosphi;
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++)
+        fabcd[i][j] -= aat_k[type] *
+          (cosphi * (da2*dthetadr[0][i][j] - da1*dthetadr[1][i][j]) +
+           sinphi * da1*da2*dphidr[i][j]);
+
+    // bond1/bond3 coupling
+
+    if (fabs(bb13t_k[type]) > SMALL) {
+
+      r1_0 = bb13t_r10[type];
+      r3_0 = bb13t_r30[type];
+      dr1 = r1 - r1_0;
+      dr2 = r3 - r3_0;
+      tk1 = -bb13t_k[type] * dr1 / r3;
+      tk2 = -bb13t_k[type] * dr2 / r1;
+
+      if (EFLAG) edihedral += bb13t_k[type]*dr1*dr2;
+
+      fabcd[0][0] += tk2 * vb1x;
+      fabcd[0][1] += tk2 * vb1y;
+      fabcd[0][2] += tk2 * vb1z;
+
+      fabcd[1][0] -= tk2 * vb1x;
+      fabcd[1][1] -= tk2 * vb1y;
+      fabcd[1][2] -= tk2 * vb1z;
+
+      fabcd[2][0] -= tk1 * vb3x;
+      fabcd[2][1] -= tk1 * vb3y;
+      fabcd[2][2] -= tk1 * vb3z;
+
+      fabcd[3][0] += tk1 * vb3x;
+      fabcd[3][1] += tk1 * vb3y;
+      fabcd[3][2] += tk1 * vb3z;
+    }
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += fabcd[0][0];
+      f[i1][1] += fabcd[0][1];
+      f[i1][2] += fabcd[0][2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += fabcd[1][0];
+      f[i2][1] += fabcd[1][1];
+      f[i2][2] += fabcd[1][2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += fabcd[2][0];
+      f[i3][1] += fabcd[2][1];
+      f[i3][2] += fabcd[2][2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += fabcd[3][0];
+      f[i4][1] += fabcd[3][1];
+      f[i4][2] += fabcd[3][2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,edihedral,
+                   fabcd[0],fabcd[2],fabcd[3],
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
diff --git a/src/USER-OMP/dihedral_class2_omp.h b/src/USER-OMP/dihedral_class2_omp.h
new file mode 100644
index 0000000000..889bbaf920
--- /dev/null
+++ b/src/USER-OMP/dihedral_class2_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(class2/omp,DihedralClass2OMP)
+
+#else
+
+#ifndef LMP_DIHEDRAL_CLASS2_OMP_H
+#define LMP_DIHEDRAL_CLASS2_OMP_H
+
+#include "dihedral_class2.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class DihedralClass2OMP : public DihedralClass2, public ThrOMP {
+
+ public:
+  DihedralClass2OMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/dihedral_cosine_shift_exp_omp.cpp b/src/USER-OMP/dihedral_cosine_shift_exp_omp.cpp
new file mode 100644
index 0000000000..ecb1a85fa4
--- /dev/null
+++ b/src/USER-OMP/dihedral_cosine_shift_exp_omp.cpp
@@ -0,0 +1,262 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "dihedral_cosine_shift_exp_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+DihedralCosineShiftExpOMP::DihedralCosineShiftExpOMP(class LAMMPS *lmp)
+  : DihedralCosineShiftExp(lmp), ThrOMP(lmp,THR_DIHEDRAL)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralCosineShiftExpOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->ndihedrallist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void DihedralCosineShiftExpOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+
+  int i1,i2,i3,i4,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double edihedral,f1[3],f2[3],f3[3],f4[3];
+  double ax,ay,az,bx,by,bz,rasq,rbsq,rgsq,rg,rginv,ra2inv,rb2inv,rabinv;
+  double df,fg,hg,fga,hgb,gaa,gbb;
+  double dtfx,dtfy,dtfz,dtgx,dtgy,dtgz,dthx,dthy,dthz;
+  double c,s,sx2,sy2,sz2;
+  double cccpsss,cssmscc,exp2;
+
+  edihedral = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const dihedrallist = neighbor->dihedrallist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = dihedrallist[n][0];
+    i2 = dihedrallist[n][1];
+    i3 = dihedrallist[n][2];
+    i4 = dihedrallist[n][3];
+    type = dihedrallist[n][4];
+
+    // 1st bond
+
+    vb1x = x[i1][0] - x[i2][0];
+    vb1y = x[i1][1] - x[i2][1];
+    vb1z = x[i1][2] - x[i2][2];
+
+    // 2nd bond
+
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    vb2xm = -vb2x;
+    vb2ym = -vb2y;
+    vb2zm = -vb2z;
+
+    // 3rd bond
+
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+
+    // c,s calculation
+
+    ax = vb1y*vb2zm - vb1z*vb2ym;
+    ay = vb1z*vb2xm - vb1x*vb2zm;
+    az = vb1x*vb2ym - vb1y*vb2xm;
+    bx = vb3y*vb2zm - vb3z*vb2ym;
+    by = vb3z*vb2xm - vb3x*vb2zm;
+    bz = vb3x*vb2ym - vb3y*vb2xm;
+
+    rasq = ax*ax + ay*ay + az*az;
+    rbsq = bx*bx + by*by + bz*bz;
+    rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
+    rg = sqrt(rgsq);
+
+    rginv = ra2inv = rb2inv = 0.0;
+    if (rg > 0) rginv = 1.0/rg;
+    if (rasq > 0) ra2inv = 1.0/rasq;
+    if (rbsq > 0) rb2inv = 1.0/rbsq;
+    rabinv = sqrt(ra2inv*rb2inv);
+
+    c = (ax*bx + ay*by + az*bz)*rabinv;
+    s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
+
+    // error check
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    double aa=a[type];
+    double uumin=umin[type];
+
+    cccpsss = c*cost[type]+s*sint[type];
+    cssmscc = c*sint[type]-s*cost[type];
+
+    if (doExpansion[type]) {
+      //  |a|<0.001 so use expansions relative precision <1e-5
+      if (EFLAG) edihedral = -0.125*(1+cccpsss)*(4+aa*(cccpsss-1))*uumin;
+      df=0.5*uumin*( cssmscc + 0.5*aa*cccpsss);
+    } else {
+      exp2=exp(0.5*aa*(1+cccpsss));
+      if (EFLAG) edihedral = opt1[type]*(1-exp2);
+      df= 0.5*opt1[type]*aa* ( exp2*cssmscc );
+    }
+
+    fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
+    hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
+    fga = fg*ra2inv*rginv;
+    hgb = hg*rb2inv*rginv;
+    gaa = -ra2inv*rg;
+    gbb = rb2inv*rg;
+
+    dtfx = gaa*ax;
+    dtfy = gaa*ay;
+    dtfz = gaa*az;
+    dtgx = fga*ax - hgb*bx;
+    dtgy = fga*ay - hgb*by;
+    dtgz = fga*az - hgb*bz;
+    dthx = gbb*bx;
+    dthy = gbb*by;
+    dthz = gbb*bz;
+
+    sx2 = df*dtgx;
+    sy2 = df*dtgy;
+    sz2 = df*dtgz;
+
+    f1[0] = df*dtfx;
+    f1[1] = df*dtfy;
+    f1[2] = df*dtfz;
+
+    f2[0] = sx2 - f1[0];
+    f2[1] = sy2 - f1[1];
+    f2[2] = sz2 - f1[2];
+
+    f4[0] = df*dthx;
+    f4[1] = df*dthy;
+    f4[2] = df*dthz;
+
+    f3[0] = -sx2 - f4[0];
+    f3[1] = -sy2 - f4[1];
+    f3[2] = -sz2 - f4[2];
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f2[0];
+      f[i2][1] += f2[1];
+      f[i2][2] += f2[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0];
+      f[i4][1] += f4[1];
+      f[i4][2] += f4[2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,edihedral,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
diff --git a/src/USER-OMP/dihedral_cosine_shift_exp_omp.h b/src/USER-OMP/dihedral_cosine_shift_exp_omp.h
new file mode 100644
index 0000000000..7e1accebf1
--- /dev/null
+++ b/src/USER-OMP/dihedral_cosine_shift_exp_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(cosine/shift/exp/omp,DihedralCosineShiftExpOMP)
+
+#else
+
+#ifndef LMP_DIHEDRAL_COSINE_SHIFT_EXP_OMP_H
+#define LMP_DIHEDRAL_COSINE_SHIFT_EXP_OMP_H
+
+#include "dihedral_cosine_shift_exp.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class DihedralCosineShiftExpOMP : public DihedralCosineShiftExp, public ThrOMP {
+
+ public:
+  DihedralCosineShiftExpOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/dihedral_harmonic_omp.cpp b/src/USER-OMP/dihedral_harmonic_omp.cpp
new file mode 100644
index 0000000000..e3b94c8f9a
--- /dev/null
+++ b/src/USER-OMP/dihedral_harmonic_omp.cpp
@@ -0,0 +1,269 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "dihedral_harmonic_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+DihedralHarmonicOMP::DihedralHarmonicOMP(class LAMMPS *lmp)
+  : DihedralHarmonic(lmp), ThrOMP(lmp,THR_DIHEDRAL)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralHarmonicOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->ndihedrallist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void DihedralHarmonicOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+
+  int i1,i2,i3,i4,i,m,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double edihedral,f1[3],f2[3],f3[3],f4[3];
+  double ax,ay,az,bx,by,bz,rasq,rbsq,rgsq,rg,rginv,ra2inv,rb2inv,rabinv;
+  double df,df1,ddf1,fg,hg,fga,hgb,gaa,gbb;
+  double dtfx,dtfy,dtfz,dtgx,dtgy,dtgz,dthx,dthy,dthz;
+  double c,s,p,sx2,sy2,sz2;
+
+  edihedral = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const dihedrallist = neighbor->dihedrallist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = dihedrallist[n][0];
+    i2 = dihedrallist[n][1];
+    i3 = dihedrallist[n][2];
+    i4 = dihedrallist[n][3];
+    type = dihedrallist[n][4];
+
+    // 1st bond
+
+    vb1x = x[i1][0] - x[i2][0];
+    vb1y = x[i1][1] - x[i2][1];
+    vb1z = x[i1][2] - x[i2][2];
+
+    // 2nd bond
+
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    vb2xm = -vb2x;
+    vb2ym = -vb2y;
+    vb2zm = -vb2z;
+
+    // 3rd bond
+
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+
+    // c,s calculation
+
+    ax = vb1y*vb2zm - vb1z*vb2ym;
+    ay = vb1z*vb2xm - vb1x*vb2zm;
+    az = vb1x*vb2ym - vb1y*vb2xm;
+    bx = vb3y*vb2zm - vb3z*vb2ym;
+    by = vb3z*vb2xm - vb3x*vb2zm;
+    bz = vb3x*vb2ym - vb3y*vb2xm;
+
+    rasq = ax*ax + ay*ay + az*az;
+    rbsq = bx*bx + by*by + bz*bz;
+    rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
+    rg = sqrt(rgsq);
+
+    rginv = ra2inv = rb2inv = 0.0;
+    if (rg > 0) rginv = 1.0/rg;
+    if (rasq > 0) ra2inv = 1.0/rasq;
+    if (rbsq > 0) rb2inv = 1.0/rbsq;
+    rabinv = sqrt(ra2inv*rb2inv);
+
+    c = (ax*bx + ay*by + az*bz)*rabinv;
+    s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
+
+    // error check
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    m = multiplicity[type];
+    p = 1.0;
+    ddf1 = df1 = 0.0;
+
+    for (i = 0; i < m; i++) {
+      ddf1 = p*c - df1*s;
+      df1 = p*s + df1*c;
+      p = ddf1;
+    }
+
+    p = p*cos_shift[type] + df1*sin_shift[type];
+    df1 = df1*cos_shift[type] - ddf1*sin_shift[type];
+    df1 *= -m;
+    p += 1.0;
+
+    if (m == 0) {
+      p = 1.0 + cos_shift[type];
+      df1 = 0.0;
+    }
+
+    if (EFLAG) edihedral = k[type] * p;
+
+    fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
+    hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
+    fga = fg*ra2inv*rginv;
+    hgb = hg*rb2inv*rginv;
+    gaa = -ra2inv*rg;
+    gbb = rb2inv*rg;
+
+    dtfx = gaa*ax;
+    dtfy = gaa*ay;
+    dtfz = gaa*az;
+    dtgx = fga*ax - hgb*bx;
+    dtgy = fga*ay - hgb*by;
+    dtgz = fga*az - hgb*bz;
+    dthx = gbb*bx;
+    dthy = gbb*by;
+    dthz = gbb*bz;
+
+    df = -k[type] * df1;
+
+    sx2 = df*dtgx;
+    sy2 = df*dtgy;
+    sz2 = df*dtgz;
+
+    f1[0] = df*dtfx;
+    f1[1] = df*dtfy;
+    f1[2] = df*dtfz;
+
+    f2[0] = sx2 - f1[0];
+    f2[1] = sy2 - f1[1];
+    f2[2] = sz2 - f1[2];
+
+    f4[0] = df*dthx;
+    f4[1] = df*dthy;
+    f4[2] = df*dthz;
+
+    f3[0] = -sx2 - f4[0];
+    f3[1] = -sy2 - f4[1];
+    f3[2] = -sz2 - f4[2];
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f2[0];
+      f[i2][1] += f2[1];
+      f[i2][2] += f2[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0];
+      f[i4][1] += f4[1];
+      f[i4][2] += f4[2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,edihedral,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
diff --git a/src/USER-OMP/dihedral_harmonic_omp.h b/src/USER-OMP/dihedral_harmonic_omp.h
new file mode 100644
index 0000000000..b55bf015f2
--- /dev/null
+++ b/src/USER-OMP/dihedral_harmonic_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(harmonic/omp,DihedralHarmonicOMP)
+
+#else
+
+#ifndef LMP_DIHEDRAL_HARMONIC_OMP_H
+#define LMP_DIHEDRAL_HARMONIC_OMP_H
+
+#include "dihedral_harmonic.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class DihedralHarmonicOMP : public DihedralHarmonic, public ThrOMP {
+
+ public:
+  DihedralHarmonicOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/dihedral_helix_omp.cpp b/src/USER-OMP/dihedral_helix_omp.cpp
new file mode 100644
index 0000000000..fd7025cdb4
--- /dev/null
+++ b/src/USER-OMP/dihedral_helix_omp.cpp
@@ -0,0 +1,282 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "dihedral_helix_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "math_const.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+#define SMALLER   0.00001
+
+/* ---------------------------------------------------------------------- */
+
+DihedralHelixOMP::DihedralHelixOMP(class LAMMPS *lmp)
+  : DihedralHelix(lmp), ThrOMP(lmp,THR_DIHEDRAL)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralHelixOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->ndihedrallist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void DihedralHelixOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+
+  int i1,i2,i3,i4,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double edihedral,f1[3],f2[3],f3[3],f4[3];
+  double sb1,sb2,sb3,rb1,rb3,c0,b1mag2,b1mag,b2mag2;
+  double b2mag,b3mag2,b3mag,ctmp,r12c1,c1mag,r12c2;
+  double c2mag,sc1,sc2,s1,s12,c,pd,a,a11,a22;
+  double a33,a12,a13,a23,sx2,sy2,sz2;
+  double s2,cx,cy,cz,cmag,dx,phi,si,siinv,sin2;
+
+  edihedral = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const dihedrallist = neighbor->dihedrallist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = dihedrallist[n][0];
+    i2 = dihedrallist[n][1];
+    i3 = dihedrallist[n][2];
+    i4 = dihedrallist[n][3];
+    type = dihedrallist[n][4];
+
+    // 1st bond
+
+    vb1x = x[i1][0] - x[i2][0];
+    vb1y = x[i1][1] - x[i2][1];
+    vb1z = x[i1][2] - x[i2][2];
+
+    // 2nd bond
+
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    vb2xm = -vb2x;
+    vb2ym = -vb2y;
+    vb2zm = -vb2z;
+
+    // 3rd bond
+
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+
+    // c0 calculation
+
+    sb1 = 1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
+    sb2 = 1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z);
+    sb3 = 1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
+
+    rb1 = sqrt(sb1);
+    rb3 = sqrt(sb3);
+
+    c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
+
+    // 1st and 2nd angle
+
+    b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
+    b1mag = sqrt(b1mag2);
+    b2mag2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
+    b2mag = sqrt(b2mag2);
+    b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
+    b3mag = sqrt(b3mag2);
+
+    ctmp = vb1x*vb2x + vb1y*vb2y + vb1z*vb2z;
+    r12c1 = 1.0 / (b1mag*b2mag);
+    c1mag = ctmp * r12c1;
+
+    ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
+    r12c2 = 1.0 / (b2mag*b3mag);
+    c2mag = ctmp * r12c2;
+
+    // cos and sin of 2 angles and final c
+
+    sin2 = MAX(1.0 - c1mag*c1mag,0.0);
+    sc1 = sqrt(sin2);
+    if (sc1 < SMALL) sc1 = SMALL;
+    sc1 = 1.0/sc1;
+
+    sin2 = MAX(1.0 - c2mag*c2mag,0.0);
+    sc2 = sqrt(sin2);
+    if (sc2 < SMALL) sc2 = SMALL;
+    sc2 = 1.0/sc2;
+
+    s1 = sc1 * sc1;
+    s2 = sc2 * sc2;
+    s12 = sc1 * sc2;
+    c = (c0 + c1mag*c2mag) * s12;
+
+    cx = vb1y*vb2z - vb1z*vb2y;
+    cy = vb1z*vb2x - vb1x*vb2z;
+    cz = vb1x*vb2y - vb1y*vb2x;
+    cmag = sqrt(cx*cx + cy*cy + cz*cz);
+    dx = (cx*vb3x + cy*vb3y + cz*vb3z)/cmag/b3mag;
+
+    // error check
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    phi = acos(c);
+    if (dx < 0.0) phi *= -1.0;
+    si = sin(phi);
+    if (fabs(si) < SMALLER) si = SMALLER;
+    siinv = 1.0/si;
+
+    pd = -aphi[type] + 3.0*bphi[type]*sin(3.0*phi)*siinv +
+      cphi[type]*sin(phi + MY_PI4)*siinv;
+
+    if (EFLAG) edihedral = aphi[type]*(1.0 - c) + bphi[type]*(1.0 + cos(3.0*phi)) +
+                 cphi[type]*(1.0 + cos(phi + MY_PI4));
+
+    a = pd;
+    c = c * a;
+    s12 = s12 * a;
+    a11 = c*sb1*s1;
+    a22 = -sb2 * (2.0*c0*s12 - c*(s1+s2));
+    a33 = c*sb3*s2;
+    a12 = -r12c1 * (c1mag*c*s1 + c2mag*s12);
+    a13 = -rb1*rb3*s12;
+    a23 = r12c2 * (c2mag*c*s2 + c1mag*s12);
+
+    sx2  = a12*vb1x + a22*vb2x + a23*vb3x;
+    sy2  = a12*vb1y + a22*vb2y + a23*vb3y;
+    sz2  = a12*vb1z + a22*vb2z + a23*vb3z;
+
+    f1[0] = a11*vb1x + a12*vb2x + a13*vb3x;
+    f1[1] = a11*vb1y + a12*vb2y + a13*vb3y;
+    f1[2] = a11*vb1z + a12*vb2z + a13*vb3z;
+
+    f2[0] = -sx2 - f1[0];
+    f2[1] = -sy2 - f1[1];
+    f2[2] = -sz2 - f1[2];
+
+    f4[0] = a13*vb1x + a23*vb2x + a33*vb3x;
+    f4[1] = a13*vb1y + a23*vb2y + a33*vb3y;
+    f4[2] = a13*vb1z + a23*vb2z + a33*vb3z;
+
+    f3[0] = sx2 - f4[0];
+    f3[1] = sy2 - f4[1];
+    f3[2] = sz2 - f4[2];
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f2[0];
+      f[i2][1] += f2[1];
+      f[i2][2] += f2[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0];
+      f[i4][1] += f4[1];
+      f[i4][2] += f4[2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,edihedral,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
diff --git a/src/USER-OMP/dihedral_helix_omp.h b/src/USER-OMP/dihedral_helix_omp.h
new file mode 100644
index 0000000000..24f16c4d7e
--- /dev/null
+++ b/src/USER-OMP/dihedral_helix_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(helix/omp,DihedralHelixOMP)
+
+#else
+
+#ifndef LMP_DIHEDRAL_HELIX_OMP_H
+#define LMP_DIHEDRAL_HELIX_OMP_H
+
+#include "dihedral_helix.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class DihedralHelixOMP : public DihedralHelix, public ThrOMP {
+
+ public:
+  DihedralHelixOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/dihedral_multi_harmonic_omp.cpp b/src/USER-OMP/dihedral_multi_harmonic_omp.cpp
new file mode 100644
index 0000000000..a68bebfdf6
--- /dev/null
+++ b/src/USER-OMP/dihedral_multi_harmonic_omp.cpp
@@ -0,0 +1,269 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "dihedral_multi_harmonic_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+DihedralMultiHarmonicOMP::DihedralMultiHarmonicOMP(class LAMMPS *lmp)
+  : DihedralMultiHarmonic(lmp), ThrOMP(lmp,THR_DIHEDRAL)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralMultiHarmonicOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->ndihedrallist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void DihedralMultiHarmonicOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+
+  int i1,i2,i3,i4,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double edihedral,f1[3],f2[3],f3[3],f4[3];
+  double sb1,sb2,sb3,rb1,rb3,c0,b1mag2,b1mag,b2mag2;
+  double b2mag,b3mag2,b3mag,ctmp,r12c1,c1mag,r12c2;
+  double c2mag,sc1,sc2,s1,s12,c,pd,a,a11,a22;
+  double a33,a12,a13,a23,sx2,sy2,sz2;
+  double s2,sin2;
+
+  edihedral = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const dihedrallist = neighbor->dihedrallist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = dihedrallist[n][0];
+    i2 = dihedrallist[n][1];
+    i3 = dihedrallist[n][2];
+    i4 = dihedrallist[n][3];
+    type = dihedrallist[n][4];
+
+    // 1st bond
+
+    vb1x = x[i1][0] - x[i2][0];
+    vb1y = x[i1][1] - x[i2][1];
+    vb1z = x[i1][2] - x[i2][2];
+
+    // 2nd bond
+
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    vb2xm = -vb2x;
+    vb2ym = -vb2y;
+    vb2zm = -vb2z;
+
+    // 3rd bond
+
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+
+    // c0 calculation
+
+    sb1 = 1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
+    sb2 = 1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z);
+    sb3 = 1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
+
+    rb1 = sqrt(sb1);
+    rb3 = sqrt(sb3);
+
+    c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
+
+    // 1st and 2nd angle
+
+    b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
+    b1mag = sqrt(b1mag2);
+    b2mag2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
+    b2mag = sqrt(b2mag2);
+    b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
+    b3mag = sqrt(b3mag2);
+
+    ctmp = vb1x*vb2x + vb1y*vb2y + vb1z*vb2z;
+    r12c1 = 1.0 / (b1mag*b2mag);
+    c1mag = ctmp * r12c1;
+
+    ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
+    r12c2 = 1.0 / (b2mag*b3mag);
+    c2mag = ctmp * r12c2;
+
+    // cos and sin of 2 angles and final c
+
+    sin2 = MAX(1.0 - c1mag*c1mag,0.0);
+    sc1 = sqrt(sin2);
+    if (sc1 < SMALL) sc1 = SMALL;
+    sc1 = 1.0/sc1;
+
+    sin2 = MAX(1.0 - c2mag*c2mag,0.0);
+    sc2 = sqrt(sin2);
+    if (sc2 < SMALL) sc2 = SMALL;
+    sc2 = 1.0/sc2;
+
+    s1 = sc1 * sc1;
+    s2 = sc2 * sc2;
+    s12 = sc1 * sc2;
+    c = (c0 + c1mag*c2mag) * s12;
+
+    // error check
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+      if (screen) {
+        char str[128];
+        sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // force & energy
+    // p = sum (i=1,5) a_i * c**(i-1)
+    // pd = dp/dc
+
+    pd = a2[type] + c*(2.0*a3[type] + c*(3.0*a4[type] + c*4.0*a5[type]));
+
+    if (EFLAG)
+      edihedral = a1[type] + c*(a2[type] + c*(a3[type] + c*(a4[type] + c*a5[type])));
+
+    a = pd;
+    c = c * a;
+    s12 = s12 * a;
+    a11 = c*sb1*s1;
+    a22 = -sb2 * (2.0*c0*s12 - c*(s1+s2));
+    a33 = c*sb3*s2;
+    a12 = -r12c1*(c1mag*c*s1 + c2mag*s12);
+    a13 = -rb1*rb3*s12;
+    a23 = r12c2*(c2mag*c*s2 + c1mag*s12);
+
+    sx2  = a12*vb1x + a22*vb2x + a23*vb3x;
+    sy2  = a12*vb1y + a22*vb2y + a23*vb3y;
+    sz2  = a12*vb1z + a22*vb2z + a23*vb3z;
+
+    f1[0] = a11*vb1x + a12*vb2x + a13*vb3x;
+    f1[1] = a11*vb1y + a12*vb2y + a13*vb3y;
+    f1[2] = a11*vb1z + a12*vb2z + a13*vb3z;
+
+    f2[0] = -sx2 - f1[0];
+    f2[1] = -sy2 - f1[1];
+    f2[2] = -sz2 - f1[2];
+
+    f4[0] = a13*vb1x + a23*vb2x + a33*vb3x;
+    f4[1] = a13*vb1y + a23*vb2y + a33*vb3y;
+    f4[2] = a13*vb1z + a23*vb2z + a33*vb3z;
+
+    f3[0] = sx2 - f4[0];
+    f3[1] = sy2 - f4[1];
+    f3[2] = sz2 - f4[2];
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f2[0];
+      f[i2][1] += f2[1];
+      f[i2][2] += f2[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0];
+      f[i4][1] += f4[1];
+      f[i4][2] += f4[2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,edihedral,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
diff --git a/src/USER-OMP/dihedral_multi_harmonic_omp.h b/src/USER-OMP/dihedral_multi_harmonic_omp.h
new file mode 100644
index 0000000000..c6b80237a5
--- /dev/null
+++ b/src/USER-OMP/dihedral_multi_harmonic_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(multi/harmonic/omp,DihedralMultiHarmonicOMP)
+
+#else
+
+#ifndef LMP_DIHEDRAL_MULTI_HARMONIC_OMP_H
+#define LMP_DIHEDRAL_MULTI_HARMONIC_OMP_H
+
+#include "dihedral_multi_harmonic.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class DihedralMultiHarmonicOMP : public DihedralMultiHarmonic, public ThrOMP {
+
+ public:
+  DihedralMultiHarmonicOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/dihedral_opls_omp.cpp b/src/USER-OMP/dihedral_opls_omp.cpp
new file mode 100644
index 0000000000..3810dc1985
--- /dev/null
+++ b/src/USER-OMP/dihedral_opls_omp.cpp
@@ -0,0 +1,285 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "dihedral_opls_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+#define SMALLER   0.00001
+
+/* ---------------------------------------------------------------------- */
+
+DihedralOPLSOMP::DihedralOPLSOMP(class LAMMPS *lmp)
+  : DihedralOPLS(lmp), ThrOMP(lmp,THR_DIHEDRAL)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralOPLSOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->ndihedrallist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void DihedralOPLSOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+
+  int i1,i2,i3,i4,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double edihedral,f1[3],f2[3],f3[3],f4[3];
+  double sb1,sb2,sb3,rb1,rb3,c0,b1mag2,b1mag,b2mag2;
+  double b2mag,b3mag2,b3mag,ctmp,r12c1,c1mag,r12c2;
+  double c2mag,sc1,sc2,s1,s12,c,pd,a,a11,a22;
+  double a33,a12,a13,a23,sx2,sy2,sz2;
+  double s2,cx,cy,cz,cmag,dx,phi,si,siinv,sin2;
+
+  edihedral = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const dihedrallist = neighbor->dihedrallist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = dihedrallist[n][0];
+    i2 = dihedrallist[n][1];
+    i3 = dihedrallist[n][2];
+    i4 = dihedrallist[n][3];
+    type = dihedrallist[n][4];
+
+    // 1st bond
+
+    vb1x = x[i1][0] - x[i2][0];
+    vb1y = x[i1][1] - x[i2][1];
+    vb1z = x[i1][2] - x[i2][2];
+
+    // 2nd bond
+
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    vb2xm = -vb2x;
+    vb2ym = -vb2y;
+    vb2zm = -vb2z;
+
+    // 3rd bond
+
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+
+    // c0 calculation
+
+    sb1 = 1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
+    sb2 = 1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z);
+    sb3 = 1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
+
+    rb1 = sqrt(sb1);
+    rb3 = sqrt(sb3);
+
+    c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
+
+    // 1st and 2nd angle
+
+    b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
+    b1mag = sqrt(b1mag2);
+    b2mag2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
+    b2mag = sqrt(b2mag2);
+    b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
+    b3mag = sqrt(b3mag2);
+
+    ctmp = vb1x*vb2x + vb1y*vb2y + vb1z*vb2z;
+    r12c1 = 1.0 / (b1mag*b2mag);
+    c1mag = ctmp * r12c1;
+
+    ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
+    r12c2 = 1.0 / (b2mag*b3mag);
+    c2mag = ctmp * r12c2;
+
+    // cos and sin of 2 angles and final c
+
+    sin2 = MAX(1.0 - c1mag*c1mag,0.0);
+    sc1 = sqrt(sin2);
+    if (sc1 < SMALL) sc1 = SMALL;
+    sc1 = 1.0/sc1;
+
+    sin2 = MAX(1.0 - c2mag*c2mag,0.0);
+    sc2 = sqrt(sin2);
+    if (sc2 < SMALL) sc2 = SMALL;
+    sc2 = 1.0/sc2;
+
+    s1 = sc1 * sc1;
+    s2 = sc2 * sc2;
+    s12 = sc1 * sc2;
+    c = (c0 + c1mag*c2mag) * s12;
+
+    cx = vb1y*vb2z - vb1z*vb2y;
+    cy = vb1z*vb2x - vb1x*vb2z;
+    cz = vb1x*vb2y - vb1y*vb2x;
+    cmag = sqrt(cx*cx + cy*cy + cz*cz);
+    dx = (cx*vb3x + cy*vb3y + cz*vb3z)/cmag/b3mag;
+
+    // error check
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // force & energy
+    // p = sum (i=1,4) k_i * (1 + (-1)**(i+1)*cos(i*phi) )
+    // pd = dp/dc
+
+    phi = acos(c);
+    if (dx < 0.0) phi *= -1.0;
+    si = sin(phi);
+    if (fabs(si) < SMALLER) si = SMALLER;
+    siinv = 1.0/si;
+
+    pd = k1[type] - 2.0*k2[type]*sin(2.0*phi)*siinv +
+      3.0*k3[type]*sin(3.0*phi)*siinv - 4.0*k4[type]*sin(4.0*phi)*siinv;
+
+    if (EFLAG) edihedral = k1[type]*(1.0 + c) + k2[type]*(1.0 - cos(2.0*phi))
+      + k3[type]*(1.0 + cos(3.0*phi)) + k4[type]*(1.0 - cos(4.0*phi));
+
+
+    a = pd;
+    c = c * a;
+    s12 = s12 * a;
+    a11 = c*sb1*s1;
+    a22 = -sb2 * (2.0*c0*s12 - c*(s1+s2));
+    a33 = c*sb3*s2;
+    a12 = -r12c1 * (c1mag*c*s1 + c2mag*s12);
+    a13 = -rb1*rb3*s12;
+    a23 = r12c2 * (c2mag*c*s2 + c1mag*s12);
+
+    sx2  = a12*vb1x + a22*vb2x + a23*vb3x;
+    sy2  = a12*vb1y + a22*vb2y + a23*vb3y;
+    sz2  = a12*vb1z + a22*vb2z + a23*vb3z;
+
+    f1[0] = a11*vb1x + a12*vb2x + a13*vb3x;
+    f1[1] = a11*vb1y + a12*vb2y + a13*vb3y;
+    f1[2] = a11*vb1z + a12*vb2z + a13*vb3z;
+
+    f2[0] = -sx2 - f1[0];
+    f2[1] = -sy2 - f1[1];
+    f2[2] = -sz2 - f1[2];
+
+    f4[0] = a13*vb1x + a23*vb2x + a33*vb3x;
+    f4[1] = a13*vb1y + a23*vb2y + a33*vb3y;
+    f4[2] = a13*vb1z + a23*vb2z + a33*vb3z;
+
+    f3[0] = sx2 - f4[0];
+    f3[1] = sy2 - f4[1];
+    f3[2] = sz2 - f4[2];
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f2[0];
+      f[i2][1] += f2[1];
+      f[i2][2] += f2[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0];
+      f[i4][1] += f4[1];
+      f[i4][2] += f4[2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,edihedral,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
diff --git a/src/USER-OMP/dihedral_opls_omp.h b/src/USER-OMP/dihedral_opls_omp.h
new file mode 100644
index 0000000000..fbb5455f9c
--- /dev/null
+++ b/src/USER-OMP/dihedral_opls_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(opls/omp,DihedralOPLSOMP)
+
+#else
+
+#ifndef LMP_DIHEDRAL_OPLS_OMP_H
+#define LMP_DIHEDRAL_OPLS_OMP_H
+
+#include "dihedral_opls.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class DihedralOPLSOMP : public DihedralOPLS, public ThrOMP {
+
+ public:
+  DihedralOPLSOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/dihedral_table_omp.cpp b/src/USER-OMP/dihedral_table_omp.cpp
new file mode 100644
index 0000000000..55f5ffd97b
--- /dev/null
+++ b/src/USER-OMP/dihedral_table_omp.cpp
@@ -0,0 +1,345 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "dihedral_table_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace DIHEDRAL_TABLE_NS;
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+DihedralTableOMP::DihedralTableOMP(class LAMMPS *lmp)
+  : DihedralTable(lmp), ThrOMP(lmp,THR_DIHEDRAL)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralTableOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->ndihedrallist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void DihedralTableOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,i4,n,type;
+  double edihedral,f1[3],f2[3],f3[3],f4[3];
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const dihedrallist = neighbor->dihedrallist;
+  const int nlocal = atom->nlocal;
+
+  // The dihedral angle "phi" is the angle between n123 and n234
+  // the planes defined by atoms i1,i2,i3, and i2,i3,i4.
+  //
+  // Definitions of vectors: vb12, vb23, vb34, perp12on23
+  //                         proj12on23, perp43on32, proj43on32
+  //
+  //  Note: The positions of the 4 atoms are labeled x[i1], x[i2], x[i3], x[i4]
+  //        (which are also vectors)
+  //
+  //             proj12on23                          proj34on23
+  //             --------->                         ----------->
+  //                           .
+  //                          .
+  //                         .
+  //                  x[i2] .                       x[i3]
+  //    .                __@----------vb23-------->@ . . . .           .
+  //   /|\                /|                        \                  |
+  //    |                /                           \                 |
+  //    |               /                             \                |
+  // perp12vs23        /                               \               |
+  //    |             /                                 \          perp34vs23
+  //    |          vb12                                  \             |
+  //    |           /                                   vb34           |
+  //    |          /                                       \           |
+  //    |         /                                         \          |
+  //    |        /                                           \         |
+  //            @                                             \        |
+  //                                                          _\|     \|/
+  //         x[i1]                                              @
+  //
+  //                                                           x[i4]
+  //
+
+  double vb12[g_dim]; // displacement vector from atom i1 towards atom i2
+  //     vb12[d]       = x[i2][d] - x[i1][d]      (for d=0,1,2)
+  double vb23[g_dim]; // displacement vector from atom i2 towards atom i3
+  //     vb23[d]       = x[i3][d] - x[i2][d]      (for d=0,1,2)
+  double vb34[g_dim]; // displacement vector from atom i3 towards atom i4
+  //     vb34[d]       = x[i4][d] - x[i3][d]      (for d=0,1,2)
+
+  //  n123 & n234: These two unit vectors are normal to the planes
+  //               defined by atoms 1,2,3 and 2,3,4.
+  double n123[g_dim]; //n123=vb23 x vb12 / |vb23 x vb12|  ("x" is cross product)
+  double n234[g_dim]; //n234=vb23 x vb34 / |vb23 x vb34|  ("x" is cross product)
+
+  double proj12on23[g_dim];
+  //    proj12on23[d] = (vb23[d]/|vb23|) * DotProduct(vb12,vb23)/|vb12|*|vb23|
+  double proj34on23[g_dim];
+  //    proj34on23[d] = (vb34[d]/|vb23|) * DotProduct(vb34,vb23)/|vb34|*|vb23|
+  double perp12on23[g_dim];
+  //    perp12on23[d] = v12[d] - proj12on23[d]
+  double perp34on23[g_dim];
+  //    perp34on23[d] = v34[d] - proj34on23[d]
+
+  edihedral = 0.0;
+
+  for (n = nfrom; n < nto; n++) {
+
+    i1 = dihedrallist[n][0];
+    i2 = dihedrallist[n][1];
+    i3 = dihedrallist[n][2];
+    i4 = dihedrallist[n][3];
+    type = dihedrallist[n][4];
+
+    // ------ Step 1: Compute the dihedral angle "phi" ------
+    //
+
+    // Phi() calculates the dihedral angle.
+    // This function also calculates the vectors:
+    // vb12, vb23, vb34, n123, and n234, which we will need later.
+
+
+    double phi = Phi(x[i1], x[i2], x[i3], x[i4], domain,
+                     vb12, vb23, vb34, n123, n234);
+
+
+    // ------ Step 2: Compute the gradient of phi with atomic position: ------
+    //
+    // Gradient variables:
+    //
+    // dphi_dx1, dphi_dx2, dphi_dx3, dphi_dx4 are the gradients of phi with
+    // respect to the atomic positions of atoms i1, i2, i3, i4, respectively.
+    // As an example, consider dphi_dx1.  The d'th element is:
+    double dphi_dx1[g_dim]; //                 d phi
+    double dphi_dx2[g_dim]; // dphi_dx1[d] = ----------    (partial derivatives)
+    double dphi_dx3[g_dim]; //               d x[i1][d]
+    double dphi_dx4[g_dim]; //where d=0,1,2 corresponds to x,y,z  (if g_dim==3)
+
+    double dot123             = DotProduct(vb12, vb23);
+    double dot234             = DotProduct(vb23, vb34);
+    double L23sqr             = DotProduct(vb23, vb23);
+    double L23                = sqrt(L23sqr);   // (central bond length)
+    double inv_L23sqr = 0.0;
+    double inv_L23    = 0.0;
+    if (L23sqr != 0.0) {
+      inv_L23sqr = 1.0 / L23sqr;
+      inv_L23 = 1.0 / L23;
+    }
+    double neg_inv_L23        = -inv_L23;
+    double dot123_over_L23sqr = dot123 * inv_L23sqr;
+    double dot234_over_L23sqr = dot234 * inv_L23sqr;
+
+    for (int d=0; d < g_dim; ++d) {
+      // See figure above for a visual definitions of these vectors:
+      proj12on23[d] = vb23[d] * dot123_over_L23sqr;
+      proj34on23[d] = vb23[d] * dot234_over_L23sqr;
+      perp12on23[d] = vb12[d] - proj12on23[d];
+      perp34on23[d] = vb34[d] - proj34on23[d];
+    }
+
+
+    // --- Compute the gradient vectors dphi/dx1 and dphi/dx4: ---
+
+    // These two gradients point in the direction of n123 and n234,
+    // and are scaled by the distances of atoms 1 and 4 from the central axis.
+    // Distance of atom 1 to central axis:
+    double perp12on23_len = sqrt(DotProduct(perp12on23, perp12on23));
+    // Distance of atom 4 to central axis:
+    double perp34on23_len = sqrt(DotProduct(perp34on23, perp34on23));
+
+    double inv_perp12on23 = 0.0;
+    if (perp12on23_len != 0.0) inv_perp12on23 = 1.0 / perp12on23_len;
+    double inv_perp34on23 = 0.0;
+    if (perp34on23_len != 0.0) inv_perp34on23 = 1.0 / perp34on23_len;
+
+    for (int d=0; d < g_dim; ++d) {
+      dphi_dx1[d] =  n123[d] * inv_perp12on23;
+      dphi_dx4[d] =  n234[d] * inv_perp34on23;
+    }
+
+    // --- Compute the gradient vectors dphi/dx2 and dphi/dx3: ---
+    //
+    // This is more tricky because atoms 2 and 3 are shared by both planes
+    // 123 and 234 (the angle between which defines "phi").  Moving either
+    // one of these atoms effects both the 123 and 234 planes
+    // Both the 123 and 234 planes intersect with the plane perpendicular to the
+    // central bond axis (vb23).  The two lines where these intersections occur
+    // will shift when you move either atom 2 or atom 3.  The angle between
+    // these lines is the dihedral angle, phi.  We can define four quantities:
+    // dphi123_dx2 is the change in "phi" due to the movement of the 123 plane
+    //             ...as a result of moving atom 2.
+    // dphi234_dx2 is the change in "phi" due to the movement of the 234 plane
+    //             ...as a result of moving atom 2.
+    // dphi123_dx3 is the change in "phi" due to the movement of the 123 plane
+    //             ...as a result of moving atom 3.
+    // dphi234_dx3 is the change in "phi" due to the movement of the 234 plane
+    //             ...as a result of moving atom 3.
+
+    double proj12on23_len = dot123 * inv_L23;
+    double proj34on23_len = dot234 * inv_L23;
+    // Interpretation:
+    //The magnitude of "proj12on23_len" is the length of the proj12on23 vector.
+    //The sign is positive if it points in the same direction as the central
+    //bond (vb23).  Otherwise it is negative.  The same goes for "proj34on23".
+    //(In the example figure in the comment above, both variables are positive.)
+
+    // The forumula used in the 8 lines below explained here:
+    //   "supporting_information/doc/gradient_formula_explanation/"
+    double dphi123_dx2_coef = neg_inv_L23 * (L23 + proj12on23_len);
+    double dphi234_dx2_coef = inv_L23 * proj34on23_len;
+
+    double dphi234_dx3_coef = neg_inv_L23 * (L23 + proj34on23_len);
+    double dphi123_dx3_coef = inv_L23 * proj12on23_len;
+
+    for (int d=0; d < g_dim; ++d) {
+      // Recall that the n123 and n234 plane normal vectors are proportional to
+      // the dphi/dx1 and dphi/dx2 gradients vectors
+      // It turns out we can save slightly more CPU cycles by expressing
+      // dphi/dx2 and dphi/dx3 as linear combinations of dphi/dx1 and dphi/dx2
+      // which we computed already (instead of n123 & n234).
+      dphi_dx2[d] = dphi123_dx2_coef*dphi_dx1[d] + dphi234_dx2_coef*dphi_dx4[d];
+      dphi_dx3[d] = dphi123_dx3_coef*dphi_dx1[d] + dphi234_dx3_coef*dphi_dx4[d];
+    }
+
+
+
+
+    #ifdef DIH_DEBUG_NUM
+    // ----- Numerical test? -----
+
+    cerr << "  -- testing gradient for dihedral (n="<<n<<") for atoms ("
+         << i1 << "," << i2 << "," << i3 << "," << i4 << ") --" << endl;
+
+    PrintGradientComparison(*this, dphi_dx1, dphi_dx2, dphi_dx3, dphi_dx4,
+                            domain, x[i1], x[i2], x[i3], x[i4]);
+
+    for (int d=0; d < g_dim; ++d) {
+      // The sum of all the gradients should be near 0. (translational symmetry)
+      cerr <<"sum_gradients["<<d<<"]="<<dphi_dx1[d]<<"+"<<dphi_dx2[d]<<"+"<<dphi_dx3[d]<<"+"<<dphi_dx4[d]<<"="<<dphi_dx1[d]+dphi_dx2[d]+dphi_dx3[d]+dphi_dx4[d]<<endl;
+      // These should sum to zero
+      assert(abs(dphi_dx1[d]+dphi_dx2[d]+dphi_dx3[d]+dphi_dx4[d]) < 0.0002/L23);
+    }
+    #endif // #ifdef DIH_DEBUG_NUM
+
+
+
+
+    // ----- Step 3: Calculate the energy and force in the phi direction -----
+
+    // tabulated force & energy
+    double u, m_du_dphi; //u = energy.   m_du_dphi = "minus" du/dphi
+    assert((0.0 <= phi) && (phi <= TWOPI));
+
+    uf_lookup(type, phi, u, m_du_dphi);
+
+
+    if (EFLAG) edihedral = u;
+
+    // ----- Step 4: Calculate the force direction in real space -----
+
+    // chain rule:
+    //          d U          d U      d phi
+    // -f  =   -----   =    -----  *  -----
+    //          d x         d phi      d x
+    for(int d=0; d < g_dim; ++d) {
+      f1[d] = m_du_dphi * dphi_dx1[d];
+      f2[d] = m_du_dphi * dphi_dx2[d];
+      f3[d] = m_du_dphi * dphi_dx3[d];
+      f4[d] = m_du_dphi * dphi_dx4[d];
+    }
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f2[0];
+      f[i2][1] += f2[1];
+      f[i2][2] += f2[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0];
+      f[i4][1] += f4[1];
+      f[i4][2] += f4[2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,edihedral,f1,f3,f4,
+                   vb12[0],vb12[1],vb12[2],vb23[0],vb23[1],vb23[2],vb34[0],
+                   vb34[1],vb34[2],thr);
+  }
+}
diff --git a/src/USER-OMP/dihedral_table_omp.h b/src/USER-OMP/dihedral_table_omp.h
new file mode 100644
index 0000000000..fadb57ed1d
--- /dev/null
+++ b/src/USER-OMP/dihedral_table_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(table/omp,DihedralTableOMP)
+
+#else
+
+#ifndef LMP_DIHEDRAL_TABLE_OMP_H
+#define LMP_DIHEDRAL_TABLE_OMP_H
+
+#include "dihedral_table.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class DihedralTableOMP : public DihedralTable, public ThrOMP {
+
+ public:
+  DihedralTableOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/ewald_omp.cpp b/src/USER-OMP/ewald_omp.cpp
new file mode 100644
index 0000000000..93b61af186
--- /dev/null
+++ b/src/USER-OMP/ewald_omp.cpp
@@ -0,0 +1,415 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include "ewald_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+
+#include <math.h>
+
+#include "math_const.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.00001
+
+/* ---------------------------------------------------------------------- */
+
+EwaldOMP::EwaldOMP(LAMMPS *lmp, int narg, char **arg)
+  : Ewald(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+void EwaldOMP::allocate()
+{
+  Ewald::allocate();
+
+  // always re-allocate for simplicity.
+  delete[] sfacrl;
+  delete[] sfacim;
+
+  sfacrl = new double[kmax3d*comm->nthreads];
+  sfacim = new double[kmax3d*comm->nthreads];
+}
+
+/* ----------------------------------------------------------------------
+   compute the Ewald long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+void EwaldOMP::compute(int eflag, int vflag)
+{
+  // set energy/virial flags
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  // extend size of per-atom arrays if necessary
+
+  if (atom->nlocal > nmax) {
+    memory->destroy(ek);
+    memory->destroy3d_offset(cs,-kmax_created);
+    memory->destroy3d_offset(sn,-kmax_created);
+    nmax = atom->nmax;
+    memory->create(ek,nmax,3,"ewald:ek");
+    memory->create3d_offset(cs,-kmax,kmax,3,nmax,"ewald:cs");
+    memory->create3d_offset(sn,-kmax,kmax,3,nmax,"ewald:sn");
+    kmax_created = kmax;
+  }
+
+  // partial structure factors on each processor
+  // total structure factor by summing over procs
+
+  eik_dot_r();
+  MPI_Allreduce(sfacrl,sfacrl_all,kcount,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(sfacim,sfacim_all,kcount,MPI_DOUBLE,MPI_SUM,world);
+
+  // K-space portion of electric field
+  // double loop over K-vectors and local atoms
+
+  double * const * const f = atom->f;
+  const double * const q = atom->q;
+  const int nthreads = comm->nthreads;
+  const int nlocal = atom->nlocal;
+  const double qscale = force->qqrd2e * scale;
+
+  double eng_tmp = 0.0;
+  double v0,v1,v2,v3,v4,v5;
+  v0=v1=v2=v3=v4=v5=0.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag) reduction(+:eng_tmp,v0,v1,v2,v3,v4,v5)
+#endif
+  {
+
+    int i,j,k,ifrom,ito,tid;
+    int kx,ky,kz;
+    double cypz,sypz,exprl,expim,partial;
+
+    loop_setup_thr(ifrom, ito, tid, nlocal, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, 0, NULL, NULL, thr);
+
+    for (i = ifrom; i < ito; i++) {
+      ek[i][0] = 0.0;
+      ek[i][1] = 0.0;
+      ek[i][2] = 0.0;
+    }
+
+    for (k = 0; k < kcount; k++) {
+      kx = kxvecs[k];
+      ky = kyvecs[k];
+      kz = kzvecs[k];
+
+      for (i = ifrom; i < ito; i++) {
+        cypz = cs[ky][1][i]*cs[kz][2][i] - sn[ky][1][i]*sn[kz][2][i];
+        sypz = sn[ky][1][i]*cs[kz][2][i] + cs[ky][1][i]*sn[kz][2][i];
+        exprl = cs[kx][0][i]*cypz - sn[kx][0][i]*sypz;
+        expim = sn[kx][0][i]*cypz + cs[kx][0][i]*sypz;
+        partial = expim*sfacrl_all[k] - exprl*sfacim_all[k];
+        ek[i][0] += partial*eg[k][0];
+        ek[i][1] += partial*eg[k][1];
+        ek[i][2] += partial*eg[k][2];
+
+        if (evflag_atom) {
+          const double partial_peratom = exprl*sfacrl_all[k] + expim*sfacim_all[k];
+          if (eflag_atom) eatom[i] += q[i]*ug[k]*partial_peratom;
+          if (vflag_atom)
+            for (j = 0; j < 6; j++)
+              vatom[i][j] += ug[k]*vg[k][j]*partial_peratom;
+        }
+      }
+    }
+
+    // convert E-field to force
+
+    for (i = ifrom; i < ito; i++) {
+      const double fac = qscale*q[i];
+      f[i][0] += fac*ek[i][0];
+      f[i][1] += fac*ek[i][1];
+      f[i][2] += fac*ek[i][2];
+    }
+
+    // global energy
+
+    if (eflag_global) {
+#if defined(_OPENMP)
+#pragma omp for private(k)
+#endif
+      for (k = 0; k < kcount; k++)
+        eng_tmp += ug[k] * (sfacrl_all[k]*sfacrl_all[k] +
+                            sfacim_all[k]*sfacim_all[k]);
+    }
+
+    // global virial
+
+    if (vflag_global) {
+#if defined(_OPENMP)
+#pragma omp for private(k)
+#endif
+      for (k = 0; k < kcount; k++) {
+        double uk = ug[k] * (sfacrl_all[k]*sfacrl_all[k] + sfacim_all[k]*sfacim_all[k]);
+        v0 += uk*vg[k][0];
+        v1 += uk*vg[k][1];
+        v2 += uk*vg[k][2];
+        v3 += uk*vg[k][3];
+        v4 += uk*vg[k][4];
+        v5 += uk*vg[k][5];
+      }
+    }
+
+    // per-atom energy/virial
+    // energy includes self-energy correction
+
+    if (evflag_atom) {
+      if (eflag_atom) {
+        for (i = ifrom; i < ito; i++) {
+          eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum /
+            (g_ewald*g_ewald*volume);
+          eatom[i] *= qscale;
+        }
+      }
+
+      if (vflag_atom)
+        for (i = ifrom; i < ito; i++)
+          for (j = 0; j < 6; j++) vatom[i][j] *= q[i]*qscale;
+    }
+
+    reduce_thr(this, eflag,vflag,thr);
+  } // end of omp parallel region
+
+  if (eflag_global) {
+    eng_tmp -= g_ewald*qsqsum/MY_PIS +
+      MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
+    energy = eng_tmp * qscale;
+  }
+
+  if (vflag_global) {
+    virial[0] = v0 * qscale;
+    virial[1] = v1 * qscale;
+    virial[2] = v2 * qscale;
+    virial[3] = v3 * qscale;
+    virial[4] = v4 * qscale;
+    virial[5] = v5 * qscale;
+  }
+
+  if (slabflag) slabcorr();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void EwaldOMP::eik_dot_r()
+{
+  const double * const * const x = atom->x;
+  const double * const q = atom->q;
+  const int nlocal = atom->nlocal;
+  const int nthreads = comm->nthreads;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    int i,ifrom,ito,k,l,m,n,ic,tid;
+    double cstr1,sstr1,cstr2,sstr2,cstr3,sstr3,cstr4,sstr4;
+    double sqk,clpm,slpm;
+
+    loop_setup_thr(ifrom, ito, tid, nlocal, nthreads);
+
+    double * const sfacrl_thr = sfacrl + tid*kmax3d;
+    double * const sfacim_thr = sfacim + tid*kmax3d;
+
+    n = 0;
+
+    // (k,0,0), (0,l,0), (0,0,m)
+
+    for (ic = 0; ic < 3; ic++) {
+      sqk = unitk[ic]*unitk[ic];
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        for (i = ifrom; i < ito; i++) {
+          cs[0][ic][i] = 1.0;
+          sn[0][ic][i] = 0.0;
+          cs[1][ic][i] = cos(unitk[ic]*x[i][ic]);
+          sn[1][ic][i] = sin(unitk[ic]*x[i][ic]);
+          cs[-1][ic][i] = cs[1][ic][i];
+          sn[-1][ic][i] = -sn[1][ic][i];
+          cstr1 += q[i]*cs[1][ic][i];
+          sstr1 += q[i]*sn[1][ic][i];
+        }
+        sfacrl_thr[n] = cstr1;
+        sfacim_thr[n++] = sstr1;
+      }
+    }
+
+    for (m = 2; m <= kmax; m++) {
+      for (ic = 0; ic < 3; ic++) {
+        sqk = m*unitk[ic] * m*unitk[ic];
+        if (sqk <= gsqmx) {
+          cstr1 = 0.0;
+          sstr1 = 0.0;
+          for (i = ifrom; i < ito; i++) {
+            cs[m][ic][i] = cs[m-1][ic][i]*cs[1][ic][i] -
+              sn[m-1][ic][i]*sn[1][ic][i];
+            sn[m][ic][i] = sn[m-1][ic][i]*cs[1][ic][i] +
+              cs[m-1][ic][i]*sn[1][ic][i];
+            cs[-m][ic][i] = cs[m][ic][i];
+            sn[-m][ic][i] = -sn[m][ic][i];
+            cstr1 += q[i]*cs[m][ic][i];
+            sstr1 += q[i]*sn[m][ic][i];
+          }
+          sfacrl_thr[n] = cstr1;
+          sfacim_thr[n++] = sstr1;
+        }
+      }
+    }
+
+    // 1 = (k,l,0), 2 = (k,-l,0)
+
+    for (k = 1; k <= kxmax; k++) {
+      for (l = 1; l <= kymax; l++) {
+        sqk = (k*unitk[0] * k*unitk[0]) + (l*unitk[1] * l*unitk[1]);
+        if (sqk <= gsqmx) {
+          cstr1 = 0.0;
+          sstr1 = 0.0;
+          cstr2 = 0.0;
+          sstr2 = 0.0;
+          for (i = ifrom; i < ito; i++) {
+            cstr1 += q[i]*(cs[k][0][i]*cs[l][1][i] - sn[k][0][i]*sn[l][1][i]);
+            sstr1 += q[i]*(sn[k][0][i]*cs[l][1][i] + cs[k][0][i]*sn[l][1][i]);
+            cstr2 += q[i]*(cs[k][0][i]*cs[l][1][i] + sn[k][0][i]*sn[l][1][i]);
+            sstr2 += q[i]*(sn[k][0][i]*cs[l][1][i] - cs[k][0][i]*sn[l][1][i]);
+          }
+          sfacrl_thr[n] = cstr1;
+          sfacim_thr[n++] = sstr1;
+          sfacrl_thr[n] = cstr2;
+          sfacim_thr[n++] = sstr2;
+        }
+      }
+    }
+
+    // 1 = (0,l,m), 2 = (0,l,-m)
+
+    for (l = 1; l <= kymax; l++) {
+      for (m = 1; m <= kzmax; m++) {
+        sqk = (l*unitk[1] * l*unitk[1]) + (m*unitk[2] * m*unitk[2]);
+        if (sqk <= gsqmx) {
+          cstr1 = 0.0;
+          sstr1 = 0.0;
+          cstr2 = 0.0;
+          sstr2 = 0.0;
+          for (i = ifrom; i < ito; i++) {
+            cstr1 += q[i]*(cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i]);
+            sstr1 += q[i]*(sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i]);
+            cstr2 += q[i]*(cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i]);
+            sstr2 += q[i]*(sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i]);
+          }
+          sfacrl_thr[n] = cstr1;
+          sfacim_thr[n++] = sstr1;
+          sfacrl_thr[n] = cstr2;
+          sfacim_thr[n++] = sstr2;
+        }
+      }
+    }
+
+    // 1 = (k,0,m), 2 = (k,0,-m)
+
+    for (k = 1; k <= kxmax; k++) {
+      for (m = 1; m <= kzmax; m++) {
+        sqk = (k*unitk[0] * k*unitk[0]) + (m*unitk[2] * m*unitk[2]);
+        if (sqk <= gsqmx) {
+          cstr1 = 0.0;
+          sstr1 = 0.0;
+          cstr2 = 0.0;
+          sstr2 = 0.0;
+          for (i = ifrom; i < ito; i++) {
+            cstr1 += q[i]*(cs[k][0][i]*cs[m][2][i] - sn[k][0][i]*sn[m][2][i]);
+            sstr1 += q[i]*(sn[k][0][i]*cs[m][2][i] + cs[k][0][i]*sn[m][2][i]);
+            cstr2 += q[i]*(cs[k][0][i]*cs[m][2][i] + sn[k][0][i]*sn[m][2][i]);
+            sstr2 += q[i]*(sn[k][0][i]*cs[m][2][i] - cs[k][0][i]*sn[m][2][i]);
+          }
+          sfacrl_thr[n] = cstr1;
+          sfacim_thr[n++] = sstr1;
+          sfacrl_thr[n] = cstr2;
+          sfacim_thr[n++] = sstr2;
+        }
+      }
+    }
+
+    // 1 = (k,l,m), 2 = (k,-l,m), 3 = (k,l,-m), 4 = (k,-l,-m)
+
+    for (k = 1; k <= kxmax; k++) {
+      for (l = 1; l <= kymax; l++) {
+        for (m = 1; m <= kzmax; m++) {
+          sqk = (k*unitk[0] * k*unitk[0]) + (l*unitk[1] * l*unitk[1]) +
+            (m*unitk[2] * m*unitk[2]);
+          if (sqk <= gsqmx) {
+            cstr1 = 0.0;
+            sstr1 = 0.0;
+            cstr2 = 0.0;
+            sstr2 = 0.0;
+            cstr3 = 0.0;
+            sstr3 = 0.0;
+            cstr4 = 0.0;
+            sstr4 = 0.0;
+            for (i = ifrom; i < ito; i++) {
+              clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
+              slpm = sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
+              cstr1 += q[i]*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+              sstr1 += q[i]*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+              clpm = cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i];
+              slpm = -sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
+              cstr2 += q[i]*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+              sstr2 += q[i]*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+              clpm = cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i];
+              slpm = sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i];
+              cstr3 += q[i]*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+              sstr3 += q[i]*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+              clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
+              slpm = -sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i];
+              cstr4 += q[i]*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+              sstr4 += q[i]*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+            }
+            sfacrl_thr[n] = cstr1;
+            sfacim_thr[n++] = sstr1;
+            sfacrl_thr[n] = cstr2;
+            sfacim_thr[n++] = sstr2;
+            sfacrl_thr[n] = cstr3;
+            sfacim_thr[n++] = sstr3;
+            sfacrl_thr[n] = cstr4;
+            sfacim_thr[n++] = sstr4;
+          }
+        }
+      }
+    }
+
+    sync_threads();
+    data_reduce_thr(sfacrl,kmax3d,nthreads,1,tid);
+    data_reduce_thr(sfacim,kmax3d,nthreads,1,tid);
+
+  } // end of parallel region
+}
diff --git a/src/USER-OMP/ewald_omp.h b/src/USER-OMP/ewald_omp.h
new file mode 100644
index 0000000000..222806f15a
--- /dev/null
+++ b/src/USER-OMP/ewald_omp.h
@@ -0,0 +1,42 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(ewald/omp,EwaldOMP)
+
+#else
+
+#ifndef LMP_EWALD_OMP_H
+#define LMP_EWALD_OMP_H
+
+#include "ewald.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+  class EwaldOMP : public Ewald, public ThrOMP {
+ public:
+  EwaldOMP(class LAMMPS *, int, char **);
+  virtual ~EwaldOMP() { };
+  virtual void allocate();
+  virtual void compute(int, int);
+
+ protected:
+  virtual void eik_dot_r();
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/fix_gravity_omp.cpp b/src/USER-OMP/fix_gravity_omp.cpp
new file mode 100644
index 0000000000..fd32a00e58
--- /dev/null
+++ b/src/USER-OMP/fix_gravity_omp.cpp
@@ -0,0 +1,112 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
+#include "fix_gravity_omp.h"
+#include "atom.h"
+#include "update.h"
+#include "domain.h"
+#include "input.h"
+#include "modify.h"
+#include "respa.h"
+#include "variable.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+enum{CHUTE,SPHERICAL,GRADIENT,VECTOR};
+enum{CONSTANT,EQUAL};
+
+/* ---------------------------------------------------------------------- */
+
+FixGravityOMP::FixGravityOMP(LAMMPS *lmp, int narg, char **arg) :
+  FixGravity(lmp, narg, arg) { }
+
+/* ---------------------------------------------------------------------- */
+
+void FixGravityOMP::post_force(int vflag)
+{
+  // update gravity due to variables
+
+  if (varflag != CONSTANT) {
+    modify->clearstep_compute();
+    if (mstyle == EQUAL) magnitude = input->variable->compute_equal(mvar);
+    if (vstyle == EQUAL) magnitude = input->variable->compute_equal(vvar);
+    if (pstyle == EQUAL) magnitude = input->variable->compute_equal(pvar);
+    if (tstyle == EQUAL) magnitude = input->variable->compute_equal(tvar);
+    if (xstyle == EQUAL) magnitude = input->variable->compute_equal(xvar);
+    if (ystyle == EQUAL) magnitude = input->variable->compute_equal(yvar);
+    if (zstyle == EQUAL) magnitude = input->variable->compute_equal(zvar);
+    modify->addstep_compute(update->ntimestep + 1);
+
+    set_acceleration();
+  }
+
+  const double * const * const x = atom->x;
+  double * const * const f = atom->f;
+  double * const rmass = atom->rmass;
+  double * const mass = atom->mass;
+  int * const mask = atom->mask;
+  int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double xacc_thr = xacc;
+  const double yacc_thr = yacc;
+  const double zacc_thr = zacc;
+  double massone;
+
+  int i;
+  eflag = 0;
+  double grav = 0.0;
+
+  if (rmass) {
+#if defined(_OPENMP)
+#pragma omp parallel for private(i,massone) default(none) reduction(-:grav)
+#endif
+    for (i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+        massone = rmass[i];
+        f[i][0] += massone*xacc_thr;
+        f[i][1] += massone*yacc_thr;
+        f[i][2] += massone*zacc_thr;
+        grav -= massone * (xacc_thr*x[i][0] + yacc_thr*x[i][1] + zacc_thr*x[i][2]);
+      }
+  } else {
+#if defined(_OPENMP)
+#pragma omp parallel for private(i,massone) default(none) reduction(-:grav)
+#endif
+    for (i = 0; i < nlocal; i++)
+      if (mask[i] & groupbit) {
+        massone = mass[type[i]];
+        f[i][0] += massone*xacc_thr;
+        f[i][1] += massone*yacc_thr;
+        f[i][2] += massone*zacc_thr;
+        grav -= massone * (xacc_thr*x[i][0] + yacc_thr*x[i][1] + zacc_thr*x[i][2]);
+      }
+  }
+  egrav = grav;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixGravityOMP::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  if (ilevel == nlevels_respa-1) post_force(vflag);
+}
diff --git a/src/USER-OMP/fix_gravity_omp.h b/src/USER-OMP/fix_gravity_omp.h
new file mode 100644
index 0000000000..ca104a1a9a
--- /dev/null
+++ b/src/USER-OMP/fix_gravity_omp.h
@@ -0,0 +1,38 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(gravity/omp,FixGravityOMP)
+
+#else
+
+#ifndef LMP_FIX_GRAVITY_OMP_H
+#define LMP_FIX_GRAVITY_OMP_H
+
+#include "fix_gravity.h"
+
+namespace LAMMPS_NS {
+
+class FixGravityOMP : public FixGravity {
+
+ public:
+  FixGravityOMP(class LAMMPS *, int, char **);
+  virtual void post_force(int);
+  virtual void post_force_respa(int, int, int);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/fix_nve_sphere_omp.cpp b/src/USER-OMP/fix_nve_sphere_omp.cpp
new file mode 100644
index 0000000000..bbe009ecc9
--- /dev/null
+++ b/src/USER-OMP/fix_nve_sphere_omp.cpp
@@ -0,0 +1,137 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "string.h"
+#include "fix_nve_sphere_omp.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "update.h"
+#include "respa.h"
+#include "force.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+#define INERTIA 0.4          // moment of inertia prefactor for sphere
+
+enum{NONE,DIPOLE};
+
+/* ---------------------------------------------------------------------- */
+
+/* ---------------------------------------------------------------------- */
+
+void FixNVESphereOMP::initial_integrate(int vflag)
+{
+  double * const * const x = atom->x;
+  double * const * const v = atom->v;
+  const double * const * const f = atom->f;
+  double * const * const omega = atom->omega;
+  const double * const * const torque = atom->torque;
+  const double * const radius = atom->radius;
+  const double * const rmass = atom->rmass;
+  const int * const mask = atom->mask;
+  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal;
+  int i;
+
+  // set timestep here since dt may have changed or come via rRESPA
+  const double dtfrotate = dtf / INERTIA;
+
+  // update v,x,omega for all particles
+  // d_omega/dt = torque / inertia
+#if defined(_OPENMP)
+#pragma omp parallel for private(i) default(none)
+#endif
+  for (i = 0; i < nlocal; i++) {
+    if (mask[i] & groupbit) {
+      const double dtfm = dtf / rmass[i];
+      v[i][0] += dtfm * f[i][0];
+      v[i][1] += dtfm * f[i][1];
+      v[i][2] += dtfm * f[i][2];
+      x[i][0] += dtv * v[i][0];
+      x[i][1] += dtv * v[i][1];
+      x[i][2] += dtv * v[i][2];
+
+      const double dtirotate = dtfrotate / (radius[i]*radius[i]*rmass[i]);
+      omega[i][0] += dtirotate * torque[i][0];
+      omega[i][1] += dtirotate * torque[i][1];
+      omega[i][2] += dtirotate * torque[i][2];
+    }
+  }
+
+  // update mu for dipoles
+  // d_mu/dt = omega cross mu
+  // renormalize mu to dipole length
+
+  if (extra == DIPOLE) {
+    double * const * const mu = atom->mu;
+#if defined(_OPENMP)
+#pragma omp parallel for private(i) default(none)
+#endif
+    for (i = 0; i < nlocal; i++) {
+      double g0,g1,g2,msq,scale;
+      if (mask[i] & groupbit) {
+        if (mu[i][3] > 0.0) {
+          g0 = mu[i][0] + dtv * (omega[i][1]*mu[i][2]-omega[i][2]*mu[i][1]);
+          g1 = mu[i][1] + dtv * (omega[i][2]*mu[i][0]-omega[i][0]*mu[i][2]);
+          g2 = mu[i][2] + dtv * (omega[i][0]*mu[i][1]-omega[i][1]*mu[i][0]);
+          msq = g0*g0 + g1*g1 + g2*g2;
+          scale = mu[i][3]/sqrt(msq);
+          mu[i][0] = g0*scale;
+          mu[i][1] = g1*scale;
+          mu[i][2] = g2*scale;
+        }
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNVESphereOMP::final_integrate()
+{
+  double * const * const v = atom->v;
+  const double * const * const f = atom->f;
+  double * const * const omega = atom->omega;
+  const double * const * const torque = atom->torque;
+  const double * const rmass = atom->rmass;
+  const double * const radius = atom->radius;
+  const int * const mask = atom->mask;
+  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal;
+  int i;
+
+  // set timestep here since dt may have changed or come via rRESPA
+
+  const double dtfrotate = dtf / INERTIA;
+
+  // update v,omega for all particles
+  // d_omega/dt = torque / inertia
+
+#if defined(_OPENMP)
+#pragma omp parallel for private(i) default(none)
+#endif
+  for (i = 0; i < nlocal; i++)
+    if (mask[i] & groupbit) {
+      const double dtfm = dtf / rmass[i];
+      v[i][0] += dtfm * f[i][0];
+      v[i][1] += dtfm * f[i][1];
+      v[i][2] += dtfm * f[i][2];
+
+      const double dtirotate = dtfrotate / (radius[i]*radius[i]*rmass[i]);
+      omega[i][0] += dtirotate * torque[i][0];
+      omega[i][1] += dtirotate * torque[i][1];
+      omega[i][2] += dtirotate * torque[i][2];
+    }
+}
diff --git a/src/USER-OMP/fix_nve_sphere_omp.h b/src/USER-OMP/fix_nve_sphere_omp.h
new file mode 100644
index 0000000000..ea965f1bb1
--- /dev/null
+++ b/src/USER-OMP/fix_nve_sphere_omp.h
@@ -0,0 +1,39 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(nve/sphere/omp,FixNVESphereOMP)
+
+#else
+
+#ifndef LMP_FIX_NVE_SPHERE_OMP_H
+#define LMP_FIX_NVE_SPHERE_OMP_H
+
+#include "fix_nve_sphere.h"
+
+namespace LAMMPS_NS {
+
+class FixNVESphereOMP : public FixNVESphere {
+ public:
+  FixNVESphereOMP(class LAMMPS *lmp, int narg, char **arg) :
+    FixNVESphere(lmp, narg, arg) {};
+
+  virtual void initial_integrate(int);
+  virtual void final_integrate();
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/fix_omp.cpp b/src/USER-OMP/fix_omp.cpp
new file mode 100644
index 0000000000..7e770ce907
--- /dev/null
+++ b/src/USER-OMP/fix_omp.cpp
@@ -0,0 +1,330 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   OpenMP based threading support for LAMMPS
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_request.h"
+#include "update.h"
+#include "integrate.h"
+#include "min.h"
+
+#include "fix_omp.h"
+#include "thr_data.h"
+#include "thr_omp.h"
+
+#include "pair_hybrid.h"
+#include "bond_hybrid.h"
+#include "angle_hybrid.h"
+#include "dihedral_hybrid.h"
+#include "improper_hybrid.h"
+#include "kspace.h"
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "suffix.h"
+
+#if defined(LMP_USER_CUDA)
+#include "cuda_modify_flags.h"
+#endif
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+#if defined(LMP_USER_CUDA)
+using namespace FixConstCuda;
+#endif
+
+static int get_tid()
+{
+  int tid = 0;
+#if defined(_OPENMP)
+  tid = omp_get_thread_num();
+#endif
+  return tid;
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixOMP::FixOMP(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg),
+              thr(NULL), last_omp_style(NULL), last_pair_hybrid(NULL),
+              _nthr(-1), _neighbor(true), _newton(false)
+{
+  if ((narg < 4) || (narg > 6)) error->all(FLERR,"Illegal fix OMP command");
+  if (strcmp(arg[1],"all") != 0) error->all(FLERR,"Illegal fix OMP command");
+
+  int nthreads = 1;
+  if (narg > 3) {
+#if defined(_OPENMP)
+    if (strcmp(arg[3],"*") == 0)
+#pragma omp parallel default(none) shared(nthreads)
+      nthreads = omp_get_num_threads();
+    else
+      nthreads = atoi(arg[3]);
+#endif
+  }
+
+  if (nthreads < 1)
+    error->all(FLERR,"Illegal number of threads requested.");
+
+  if (nthreads != comm->nthreads) {
+#if defined(_OPENMP)
+    omp_set_num_threads(nthreads);
+#endif
+    comm->nthreads = nthreads;
+    if (comm->me == 0) {
+      if (screen)
+        fprintf(screen,"  reset %d OpenMP thread(s) per MPI task\n", nthreads);
+      if (logfile)
+        fprintf(logfile,"  reset %d OpenMP thread(s) per MPI task\n", nthreads);
+    }
+  }
+
+  if (narg > 4) {
+    if (strcmp(arg[4],"force/neigh") == 0)
+      _neighbor = true;
+    else if (strcmp(arg[4],"force") == 0)
+      _neighbor = false;
+    else
+      error->all(FLERR,"Illegal fix omp mode requested.");
+
+    if (comm->me == 0) {
+      const char * const mode = _neighbor ? "OpenMP capable" : "serial";
+
+      if (screen)
+        fprintf(screen,"  using %s neighbor list subroutines\n", mode);
+      if (logfile)
+        fprintf(logfile,"  using %s neighbor list subroutines\n", mode);
+    }
+  }
+
+#if 0 /* to be enabled when we can switch between half and full neighbor lists */
+  if (narg > 5) {
+    if (strcmp(arg[5],"neigh/half") == 0)
+      _newton = true;
+    else if (strcmp(arg[5],"neigh/full") == 0)
+        _newton = false;
+    else
+      error->all(FLERR,"Illegal fix OMP command");
+
+    if (comm->me == 0) {
+      const char * const mode = _newton ? "half" : "full";
+
+      if (screen)
+        fprintf(screen,"  using /omp styles with %s neighbor list builds\n", mode);
+      if (logfile)
+        fprintf(logfile,"  using /omp styles with %s neighbor list builds\n", mode);
+    }
+  }
+#endif
+
+  // allocate list for per thread accumulator manager class instances
+  // and then have each thread create an instance of this class to
+  // encourage the OS to use storage that is "close" to each thread's CPU.
+  thr = new ThrData *[nthreads];
+  _nthr = nthreads;
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    const int tid = get_tid();
+    thr[tid] = new ThrData(tid);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixOMP::~FixOMP()
+{
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    const int tid = get_tid();
+    delete thr[tid];
+  }
+  delete[] thr;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixOMP::setmask()
+{
+  // compatibility with USER-CUDA
+  // our fix doesn't need any data transfer.
+#if defined(LMP_USER_CUDA)
+  if (lmp->cuda) {
+    int mask = 0;
+    mask |= PRE_FORCE_CUDA;
+    mask |= PRE_FORCE_RESPA;
+    mask |= MIN_PRE_FORCE;
+    return mask;
+  }
+#endif
+
+  int mask = 0;
+  mask |= PRE_FORCE;
+  mask |= PRE_FORCE_RESPA;
+  mask |= MIN_PRE_FORCE;
+  return mask;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixOMP::init()
+{
+  if (strstr(update->integrate_style,"respa") != NULL)
+    error->all(FLERR,"Cannot use r-RESPA with /omp styles");
+
+  int check_hybrid;
+  last_pair_hybrid = NULL;
+  last_omp_style = NULL;
+  const char *last_omp_name = NULL;
+  const char *last_hybrid_name = NULL;
+  const char *last_force_name = NULL;
+
+// determine which is the last force style with OpenMP
+// support as this is the one that has to reduce the forces
+
+#define CheckStyleForOMP(name)                                                \
+  check_hybrid = 0;                                                        \
+  if (force->name) {                                                        \
+    if ( (strcmp(force->name ## _style,"hybrid") == 0) ||                \
+         (strcmp(force->name ## _style,"hybrid/overlay") == 0) )        \
+      check_hybrid=1;                                                        \
+    if (force->name->suffix_flag & Suffix::OMP) {                        \
+      last_force_name = (const char *) #name;                                \
+      last_omp_name = force->name ## _style;                                \
+      last_omp_style = (void *) force->name;                                \
+    }                                                                        \
+  }
+
+#define CheckHybridForOMP(name,Class)                                        \
+  if (check_hybrid) {                                                        \
+    Class ## Hybrid *style = (Class ## Hybrid *) force->name;                \
+    for (int i=0; i < style->nstyles; i++) {                                \
+      if (style->styles[i]->suffix_flag & Suffix::OMP) {                \
+        last_force_name = (const char *) #name;                                \
+        last_omp_name = style->keywords[i];                                \
+        last_omp_style = style->styles[i];                                \
+      }                                                                        \
+    }                                                                        \
+  }
+
+  CheckStyleForOMP(pair);
+  CheckHybridForOMP(pair,Pair);
+  if (check_hybrid) {
+    last_pair_hybrid = last_omp_style;
+    last_hybrid_name = last_omp_name;
+  }
+
+  CheckStyleForOMP(bond);
+  CheckHybridForOMP(bond,Bond);
+
+  CheckStyleForOMP(angle);
+  CheckHybridForOMP(angle,Angle);
+
+  CheckStyleForOMP(dihedral);
+  CheckHybridForOMP(dihedral,Dihedral);
+
+  CheckStyleForOMP(improper);
+  CheckHybridForOMP(improper,Improper);
+
+  CheckStyleForOMP(kspace);
+
+#undef CheckStyleForOMP
+#undef CheckHybridForOMP
+  set_neighbor_omp();
+
+  // diagnostic output
+  if (comm->me == 0) {
+    if (last_omp_style) {
+      if (last_pair_hybrid) {
+        if (screen)
+          fprintf(screen,"Hybrid pair style last /omp style %s\n", last_hybrid_name);
+        if (logfile)
+          fprintf(logfile,"Hybrid pair style last /omp style %s\n", last_hybrid_name);
+      }
+      if (screen)
+        fprintf(screen,"Last active /omp style is %s_style %s\n",
+                last_force_name, last_omp_name);
+      if (logfile)
+        fprintf(logfile,"Last active /omp style is %s_style %s\n",
+                last_force_name, last_omp_name);
+    } else {
+      if (screen)
+        fprintf(screen,"No /omp style for force computation currently active\n");
+      if (logfile)
+        fprintf(screen,"No /omp style for force computation currently active\n");
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixOMP::set_neighbor_omp()
+{
+  // select or deselect multi-threaded neighbor
+  // list build depending on setting in package omp.
+  // NOTE: since we are at the top of the list of
+  // fixes, we cannot adjust neighbor lists from
+  // other fixes. those have to be re-implemented
+  // as /omp fix styles. :-(
+
+  const int neigh_omp = _neighbor ? 1 : 0;
+  const int nrequest = neighbor->nrequest;
+
+  for (int i = 0; i < nrequest; ++i)
+    neighbor->requests[i]->omp = neigh_omp;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// adjust size and clear out per thread accumulator arrays
+void FixOMP::pre_force(int)
+{
+  const int nall = atom->nlocal + atom->nghost;
+
+  double **f = atom->f;
+  double **torque = atom->torque;
+  double *erforce = atom->erforce;
+  double *de = atom->de;
+  double *drho = atom->drho;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(f,torque,erforce,de,drho)
+#endif
+  {
+    const int tid = get_tid();
+    thr[tid]->check_tid(tid);
+    thr[tid]->init_force(nall,f,torque,erforce,de,drho);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double FixOMP::memory_usage()
+{
+  double bytes = comm->nthreads * (sizeof(ThrData *) + sizeof(ThrData));
+  bytes += comm->nthreads * thr[0]->memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/fix_omp.h b/src/USER-OMP/fix_omp.h
new file mode 100644
index 0000000000..d797b08418
--- /dev/null
+++ b/src/USER-OMP/fix_omp.h
@@ -0,0 +1,73 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(OMP,FixOMP)
+
+#else
+
+#ifndef LMP_FIX_OMP_H
+#define LMP_FIX_OMP_H
+
+#include "fix.h"
+
+
+namespace LAMMPS_NS {
+
+class ThrData;
+
+class FixOMP : public Fix {
+  friend class ThrOMP;
+
+ public:
+  FixOMP(class LAMMPS *, int, char **);
+  virtual ~FixOMP();
+  virtual int setmask();
+  virtual void init();
+  virtual void pre_force(int);
+
+  virtual void setup_pre_force(int vflag)           { pre_force(vflag); };
+  virtual void min_setup_pre_force(int vflag)       { pre_force(vflag); };
+  virtual void min_pre_force(int vflag)             { pre_force(vflag); };
+  virtual void setup_pre_force_respa(int vflag,int) { pre_force(vflag); };
+  virtual void pre_force_respa(int vflag,int,int)   { pre_force(vflag); };
+
+  virtual double memory_usage();
+
+  ThrData *get_thr(int tid) { return thr[tid]; };
+  int get_nthr() const { return _nthr; }
+
+ protected:
+  ThrData **thr;
+  void *last_omp_style;    // pointer to the style that needs
+                           // to do the general force reduction
+  void *last_pair_hybrid;  // pointer to the pair style that needs
+                           // to call virial_fdot_compute()
+
+ public:
+  bool get_neighbor() const {return _neighbor;};
+  bool get_newton() const   {return _newton;};
+
+ private:
+  int  _nthr;     // number of currently active ThrData object
+  bool _neighbor; // en/disable threads for neighbor list construction
+  bool _newton;   // en/disable newton's 3rd law for local atoms.
+
+  void set_neighbor_omp();
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/fix_peri_neigh_omp.cpp b/src/USER-OMP/fix_peri_neigh_omp.cpp
new file mode 100644
index 0000000000..000278c8b7
--- /dev/null
+++ b/src/USER-OMP/fix_peri_neigh_omp.cpp
@@ -0,0 +1,50 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Mike Parks (SNL)
+------------------------------------------------------------------------- */
+
+#include "fix_peri_neigh_omp.h"
+#include "fix_omp.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_request.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+void FixPeriNeighOMP::init()
+{
+  if (!first) return;
+
+  // determine status of neighbor flag of the omp package command
+  int ifix = modify->find_fix("package_omp");
+  int use_omp = 0;
+  if (ifix >=0) {
+     FixOMP * fix = static_cast<FixOMP *>(lmp->modify->fix[ifix]);
+     if (fix->get_neighbor()) use_omp = 1;
+  }
+
+  // need a full neighbor list once
+
+  int irequest = neighbor->request((void *) this);
+  neighbor->requests[irequest]->pair = 0;
+  neighbor->requests[irequest]->fix  = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->omp = use_omp;
+  neighbor->requests[irequest]->occasional = 1;
+}
diff --git a/src/USER-OMP/fix_peri_neigh_omp.h b/src/USER-OMP/fix_peri_neigh_omp.h
new file mode 100644
index 0000000000..6d91f39be1
--- /dev/null
+++ b/src/USER-OMP/fix_peri_neigh_omp.h
@@ -0,0 +1,38 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(PERI_NEIGH_OMP,FixPeriNeighOMP)
+
+#else
+
+#ifndef LMP_FIX_PERI_NEIGH_OMP_H
+#define LMP_FIX_PERI_NEIGH_OMP_H
+
+#include "fix_peri_neigh.h"
+
+namespace LAMMPS_NS {
+
+class FixPeriNeighOMP : public FixPeriNeigh {
+
+ public:
+  FixPeriNeighOMP(class LAMMPS *lmp, int narg, char **argv) :
+    FixPeriNeigh(lmp,narg,argv) {};
+  virtual void init();
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/fix_qeq_comb_omp.cpp b/src/USER-OMP/fix_qeq_comb_omp.cpp
new file mode 100644
index 0000000000..18b9b5305c
--- /dev/null
+++ b/src/USER-OMP/fix_qeq_comb_omp.cpp
@@ -0,0 +1,192 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "lmptype.h"
+#include "mpi.h"
+#include "math.h"
+#include "fix_qeq_comb_omp.h"
+#include "fix_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "memory.h"
+#include "modify.h"
+#include "error.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "respa.h"
+#include "update.h"
+#include "pair_comb_omp.h"
+
+#include <string.h>
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+FixQEQCombOMP::FixQEQCombOMP(LAMMPS *lmp, int narg, char **arg)
+  : FixQEQComb(lmp, narg, arg)
+{
+  if (narg < 5) error->all(FLERR,"Illegal fix qeq/comb/omp command");
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixQEQCombOMP::init()
+{
+  if (!atom->q_flag)
+    error->all(FLERR,"Fix qeq/comb/omp requires atom attribute q");
+
+  comb = (PairComb *) force->pair_match("comb/omp",1);
+  if (comb == NULL)
+    comb = (PairComb *) force->pair_match("comb",1);
+  if (comb == NULL) error->all(FLERR,"Must use pair_style comb or comb/omp with fix qeq/comb");
+
+  if (strstr(update->integrate_style,"respa"))
+    nlevels_respa = ((Respa *) update->integrate)->nlevels;
+
+  ngroup = group->count(igroup);
+  if (ngroup == 0) error->all(FLERR,"Fix qeq/comb group has no atoms");
+
+  // determine status of neighbor flag of the omp package command
+  int ifix = modify->find_fix("package_omp");
+  int use_omp = 0;
+  if (ifix >=0) {
+     FixOMP * fix = static_cast<FixOMP *>(lmp->modify->fix[ifix]);
+     if (fix->get_neighbor()) use_omp = 1;
+  }
+
+  int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->pair = 0;
+  neighbor->requests[irequest]->fix = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->omp = use_omp;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixQEQCombOMP::post_force(int vflag)
+{
+  int i,ii,iloop,loopmax,inum,*ilist;
+  double heatpq,qmass,dtq,dtq2;
+  double enegchkall,enegmaxall;
+
+  if (update->ntimestep % nevery) return;
+
+  // reallocate work arrays if necessary
+  // qf = charge force
+  // q1 = charge displacement
+  // q2 = tmp storage of charge force for next iteration
+
+  if (atom->nmax > nmax) {
+    memory->destroy(qf);
+    memory->destroy(q1);
+    memory->destroy(q2);
+    nmax = atom->nmax;
+    memory->create(qf,nmax,"qeq:qf");
+    memory->create(q1,nmax,"qeq:q1");
+    memory->create(q2,nmax,"qeq:q2");
+    vector_atom = qf;
+  }
+
+  // more loops for first-time charge equilibrium
+
+  iloop = 0;
+  if (firstflag) loopmax = 500;
+  else loopmax = 200;
+
+  // charge-equilibration loop
+
+  if (me == 0 && fp)
+    fprintf(fp,"Charge equilibration on step " BIGINT_FORMAT "\n",
+            update->ntimestep);
+
+  heatpq = 0.05;
+  qmass  = 0.016;
+  dtq    = 0.01;
+  dtq2   = 0.5*dtq*dtq/qmass;
+
+  double enegchk = 0.0;
+  double enegtot = 0.0;
+  double enegmax = 0.0;
+
+  double *q = atom->q;
+  int *mask = atom->mask;
+
+  inum = comb->list->inum;
+  ilist = comb->list->ilist;
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    q1[i] = q2[i] = qf[i] = 0.0;
+  }
+
+  for (iloop = 0; iloop < loopmax; iloop ++ ) {
+    for (ii = 0; ii < inum; ii++) {
+      i = ilist[ii];
+      if (mask[i] & groupbit) {
+        q1[i] += qf[i]*dtq2 - heatpq*q1[i];
+        q[i]  += q1[i];
+      }
+    }
+    comm->forward_comm_fix(this);
+
+    if(comb) enegtot = comb->yasu_char(qf,igroup);
+    enegtot /= ngroup;
+    enegchk = enegmax = 0.0;
+
+    for (ii = 0; ii < inum ; ii++) {
+      i = ilist[ii];
+      if (mask[i] & groupbit) {
+        q2[i] = enegtot-qf[i];
+        enegmax = MAX(enegmax,fabs(q2[i]));
+        enegchk += fabs(q2[i]);
+        qf[i] = q2[i];
+      }
+    }
+
+    MPI_Allreduce(&enegchk,&enegchkall,1,MPI_DOUBLE,MPI_SUM,world);
+    enegchk = enegchkall/ngroup;
+    MPI_Allreduce(&enegmax,&enegmaxall,1,MPI_DOUBLE,MPI_MAX,world);
+    enegmax = enegmaxall;
+
+    if (enegchk <= precision && enegmax <= 100.0*precision) break;
+
+    if (me == 0 && fp)
+      fprintf(fp,"  iteration: %d, enegtot %.6g, "
+              "enegmax %.6g, fq deviation: %.6g\n",
+              iloop,enegtot,enegmax,enegchk);
+
+    for (ii = 0; ii < inum; ii++) {
+      i = ilist[ii];
+      if (mask[i] & groupbit)
+        q1[i] += qf[i]*dtq2 - heatpq*q1[i];
+    }
+  }
+
+  if (me == 0 && fp) {
+    if (iloop == loopmax)
+      fprintf(fp,"Charges did not converge in %d iterations\n",iloop);
+    else
+      fprintf(fp,"Charges converged in %d iterations to %.10f tolerance\n",
+              iloop,enegchk);
+  }
+}
diff --git a/src/USER-OMP/fix_qeq_comb_omp.h b/src/USER-OMP/fix_qeq_comb_omp.h
new file mode 100644
index 0000000000..0febe6b0aa
--- /dev/null
+++ b/src/USER-OMP/fix_qeq_comb_omp.h
@@ -0,0 +1,32 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(qeq/comb/omp,FixQEQCombOMP)
+
+#else
+
+#ifndef LMP_FIX_QEQ_COMB_OMP_H
+#define LMP_FIX_QEQ_COMB_OMP_H
+
+#include "fix_qeq_comb.h"
+
+namespace LAMMPS_NS {
+
+class FixQEQCombOMP : public FixQEQComb {
+ public:
+  FixQEQCombOMP(class LAMMPS *, int, char **);
+  virtual void init();
+  virtual void post_force(int);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/fix_shear_history_omp.cpp b/src/USER-OMP/fix_shear_history_omp.cpp
new file mode 100644
index 0000000000..d410f6363c
--- /dev/null
+++ b/src/USER-OMP/fix_shear_history_omp.cpp
@@ -0,0 +1,152 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "string.h"
+#include "stdio.h"
+#include "fix_shear_history_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "force.h"
+#include "pair.h"
+#include "update.h"
+#include "modify.h"
+#include "error.h"
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+#define MAXTOUCH 15
+
+/* ----------------------------------------------------------------------
+   copy shear partner info from neighbor lists to atom arrays
+   so can be exchanged with atoms
+------------------------------------------------------------------------- */
+
+void FixShearHistoryOMP::pre_exchange()
+{
+
+  const int nlocal = atom->nlocal;
+  const int nghost = atom->nghost;
+  const int nall = nlocal + nghost;
+  const int nthreads = comm->nthreads;
+
+  int flag = 0;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(flag)
+#endif
+  {
+
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+
+    // each thread works on a fixed chunk of local and ghost atoms.
+    const int ldelta = 1 + nlocal/nthreads;
+    const int lfrom = tid*ldelta;
+    const int lmax = lfrom +ldelta;
+    const int lto = (lmax > nlocal) ? nlocal : lmax;
+
+    const int gdelta = 1 + nghost/nthreads;
+    const int gfrom = nlocal + tid*gdelta;
+    const int gmax = gfrom + gdelta;
+    const int gto = (gmax > nall) ? nall : gmax;
+
+
+    int i,j,ii,jj,m,inum,jnum;
+    int *ilist,*jlist,*numneigh,**firstneigh;
+    int *touch,**firsttouch;
+    double *shear,*allshear,**firstshear;
+
+    // zero npartners for all current atoms
+
+    for (i = lfrom; i < lto; i++) npartner[i] = 0;
+
+    // copy shear info from neighbor list atoms to atom arrays
+
+    int *tag = atom->tag;
+
+    NeighList *list = pair->list;
+    inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+    firsttouch = list->listgranhistory->firstneigh;
+    firstshear = list->listgranhistory->firstdouble;
+
+    for (ii = 0; ii < inum; ii++) {
+      i = ilist[ii];
+      jlist = firstneigh[i];
+      allshear = firstshear[i];
+      jnum = numneigh[i];
+      touch = firsttouch[i];
+
+      for (jj = 0; jj < jnum; jj++) {
+        if (touch[jj]) {
+          j = jlist[jj];
+          j &= NEIGHMASK;
+          shear = &allshear[3*jj];
+
+          if ((i >= lfrom) && (i < lto)) {
+            if (npartner[i] < MAXTOUCH) {
+              m = npartner[i];
+              partner[i][m] = tag[j];
+              shearpartner[i][m][0] = shear[0];
+              shearpartner[i][m][1] = shear[1];
+              shearpartner[i][m][2] = shear[2];
+            }
+            npartner[i]++;
+          }
+
+          if ((j >= lfrom) && (j < lto)) {
+            if (npartner[j] < MAXTOUCH) {
+              m = npartner[j];
+              partner[j][m] = tag[i];
+              shearpartner[j][m][0] = -shear[0];
+              shearpartner[j][m][1] = -shear[1];
+              shearpartner[j][m][2] = -shear[2];
+            }
+            npartner[j]++;
+          }
+
+          if ((j >= gfrom) && (j < gto)) {
+            npartner[j]++;
+          }
+        }
+      }
+    }
+
+    // test for too many touching neighbors
+    int myflag = 0;
+    for (i = lfrom; i < lto; i++)
+      if (npartner[i] >= MAXTOUCH) myflag = 1;
+
+    if (myflag)
+#if defined(_OPENMP)
+#pragma omp atomic
+#endif
+      ++flag;
+  }
+
+  int flag_all;
+  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
+  if (flag_all)
+    error->all(FLERR,"Too many touching neighbors - boost MAXTOUCH");
+}
diff --git a/src/USER-OMP/fix_shear_history_omp.h b/src/USER-OMP/fix_shear_history_omp.h
new file mode 100644
index 0000000000..aa581ff6a7
--- /dev/null
+++ b/src/USER-OMP/fix_shear_history_omp.h
@@ -0,0 +1,38 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(SHEAR_HISTORY/omp,FixShearHistoryOMP)
+
+#else
+
+#ifndef LMP_FIX_SHEAR_HISTORY_OMP_H
+#define LMP_FIX_SHEAR_HISTORY_OMP_H
+
+#include "fix_shear_history.h"
+
+namespace LAMMPS_NS {
+
+class FixShearHistoryOMP : public FixShearHistory {
+
+ public:
+  FixShearHistoryOMP(class LAMMPS *lmp, int narg, char **argv)
+    : FixShearHistory(lmp,narg,argv) {};
+  virtual void pre_exchange();
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/fix_wall_gran_omp.cpp b/src/USER-OMP/fix_wall_gran_omp.cpp
new file mode 100644
index 0000000000..e131acc0fa
--- /dev/null
+++ b/src/USER-OMP/fix_wall_gran_omp.cpp
@@ -0,0 +1,148 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "fix_wall_gran_omp.h"
+#include "atom.h"
+#include "update.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+enum{XPLANE=0,YPLANE=1,ZPLANE=2,ZCYLINDER};    // XYZ PLANE need to be 0,1,2
+enum{HOOKE,HOOKE_HISTORY,HERTZ_HISTORY};
+
+#define BIG 1.0e20
+
+/* ---------------------------------------------------------------------- */
+
+FixWallGranOMP::FixWallGranOMP(LAMMPS *lmp, int narg, char **arg) :
+  FixWallGran(lmp, narg, arg) { }
+
+/* ---------------------------------------------------------------------- */
+
+void FixWallGranOMP::post_force(int vflag)
+{
+  double vwall[3];
+
+  // set position of wall to initial settings and velocity to 0.0
+  // if wiggle or shear, set wall position and velocity accordingly
+
+  double wlo = lo;
+  double whi = hi;
+  vwall[0] = vwall[1] = vwall[2] = 0.0;
+  if (wiggle) {
+    double arg = omega * (update->ntimestep - time_origin) * dt;
+    if (wallstyle == axis) {
+      wlo = lo + amplitude - amplitude*cos(arg);
+      whi = hi + amplitude - amplitude*cos(arg);
+    }
+    vwall[axis] = amplitude*omega*sin(arg);
+  } else if (wshear) vwall[axis] = vshear;
+
+  // loop over all my atoms
+  // rsq = distance from wall
+  // dx,dy,dz = signed distance from wall
+  // for rotating cylinder, reset vwall based on particle position
+  // skip atom if not close enough to wall
+  //   if wall was set to NULL, it's skipped since lo/hi are infinity
+  // compute force and torque on atom if close enough to wall
+  //   via wall potential matched to pair potential
+  // set shear if pair potential stores history
+
+  double * const * const x = atom->x;
+  double * const * const v = atom->v;
+  double * const * const f = atom->f;
+  double * const * const omega = atom->omega;
+  double * const * const torque = atom->torque;
+  double * const radius = atom->radius;
+  double * const rmass = atom->rmass;
+  const int * const mask = atom->mask;
+  const int nlocal = atom->nlocal;
+
+  shearupdate = (update->setupflag) ? 0 : 1;
+
+  int i;
+#if defined(_OPENMP)
+#pragma omp parallel for private(i) default(none) firstprivate(vwall,wlo,whi)
+#endif
+  for (i = 0; i < nlocal; i++) {
+
+    if (mask[i] & groupbit) {
+      double dx,dy,dz,del1,del2,delxy,delr,rsq;
+
+      dx = dy = dz = 0.0;
+
+      if (wallstyle == XPLANE) {
+        del1 = x[i][0] - wlo;
+        del2 = whi - x[i][0];
+        if (del1 < del2) dx = del1;
+        else dx = -del2;
+      } else if (wallstyle == YPLANE) {
+        del1 = x[i][1] - wlo;
+        del2 = whi - x[i][1];
+        if (del1 < del2) dy = del1;
+        else dy = -del2;
+      } else if (wallstyle == ZPLANE) {
+        del1 = x[i][2] - wlo;
+        del2 = whi - x[i][2];
+        if (del1 < del2) dz = del1;
+        else dz = -del2;
+      } else if (wallstyle == ZCYLINDER) {
+        delxy = sqrt(x[i][0]*x[i][0] + x[i][1]*x[i][1]);
+        delr = cylradius - delxy;
+        if (delr > radius[i]) dz = cylradius;
+        else {
+          dx = -delr/delxy * x[i][0];
+          dy = -delr/delxy * x[i][1];
+          if (wshear && axis != 2) {
+            vwall[0] = vshear * x[i][1]/delxy;
+            vwall[1] = -vshear * x[i][0]/delxy;
+            vwall[2] = 0.0;
+          }
+        }
+      }
+
+      rsq = dx*dx + dy*dy + dz*dz;
+
+      if (rsq > radius[i]*radius[i]) {
+        if (pairstyle != HOOKE) {
+          shear[i][0] = 0.0;
+          shear[i][1] = 0.0;
+          shear[i][2] = 0.0;
+        }
+      } else {
+        if (pairstyle == HOOKE)
+          hooke(rsq,dx,dy,dz,vwall,v[i],f[i],omega[i],torque[i],
+                radius[i],rmass[i]);
+        else if (pairstyle == HOOKE_HISTORY)
+          hooke_history(rsq,dx,dy,dz,vwall,v[i],f[i],omega[i],torque[i],
+                        radius[i],rmass[i],shear[i]);
+        else if (pairstyle == HERTZ_HISTORY)
+          hertz_history(rsq,dx,dy,dz,vwall,v[i],f[i],omega[i],torque[i],
+                        radius[i],rmass[i],shear[i]);
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixWallGranOMP::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  if (ilevel == nlevels_respa-1) post_force(vflag);
+}
diff --git a/src/USER-OMP/fix_wall_gran_omp.h b/src/USER-OMP/fix_wall_gran_omp.h
new file mode 100644
index 0000000000..cd26a7b40a
--- /dev/null
+++ b/src/USER-OMP/fix_wall_gran_omp.h
@@ -0,0 +1,38 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(wall/gran/omp,FixWallGranOMP)
+
+#else
+
+#ifndef LMP_FIX_WALL_GRAN_OMP_H
+#define LMP_FIX_WALL_GRAN_OMP_H
+
+#include "fix_wall_gran.h"
+
+namespace LAMMPS_NS {
+
+class FixWallGranOMP : public FixWallGran {
+
+ public:
+  FixWallGranOMP(class LAMMPS *, int, char **);
+  virtual void post_force(int);
+  virtual void post_force_respa(int, int, int);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/hack_openmp_for_pgi.sh b/src/USER-OMP/hack_openmp_for_pgi.sh
new file mode 100644
index 0000000000..a076fd1fcb
--- /dev/null
+++ b/src/USER-OMP/hack_openmp_for_pgi.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+for f in *.h *.cpp
+do \
+   sed -e '/#pragma omp/s/^\(.*default\)(none)\(.*\)$/\1(shared)\2/' \
+       -e '/#pragma omp/s/shared([a-z0-9,_]\+)//' \
+       -i.bak $f
+done
diff --git a/src/USER-OMP/improper_class2_omp.cpp b/src/USER-OMP/improper_class2_omp.cpp
new file mode 100644
index 0000000000..08f4a21e00
--- /dev/null
+++ b/src/USER-OMP/improper_class2_omp.cpp
@@ -0,0 +1,698 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "improper_class2_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+ImproperClass2OMP::ImproperClass2OMP(class LAMMPS *lmp)
+  : ImproperClass2(lmp), ThrOMP(lmp,THR_IMPROPER)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ImproperClass2OMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nimproperlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void ImproperClass2OMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,i4,i,j,k,n,type;
+  double eimproper;
+  double delr[3][3],rmag[3],rinvmag[3],rmag2[3];
+  double theta[3],costheta[3],sintheta[3];
+  double cossqtheta[3],sinsqtheta[3],invstheta[3];
+  double rABxrCB[3],rDBxrAB[3],rCBxrDB[3];
+  double ddelr[3][4],dr[3][4][3],dinvr[3][4][3];
+  double dthetadr[3][4][3],dinvsth[3][4][3];
+  double dinv3r[4][3],dinvs3r[3][4][3];
+  double drCBxrDB[3],rCBxdrDB[3],drDBxrAB[3],rDBxdrAB[3];
+  double drABxrCB[3],rABxdrCB[3];
+  double dot1,dot2,dd[3];
+  double fdot[3][4][3],ftmp,invs3r[3],inv3r;
+  double t,tt1,tt3,sc1;
+  double dotCBDBAB,dotDBABCB,dotABCBDB;
+  double chi,deltachi,d2chi,cossin2;
+  double drAB[3][4][3],drCB[3][4][3],drDB[3][4][3];
+  double dchi[3][4][3],dtotalchi[4][3];
+  double schiABCD,chiABCD,schiCBDA,chiCBDA,schiDBAC,chiDBAC;
+  double fabcd[4][3];
+
+  eimproper = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const improperlist = neighbor->improperlist;
+  const int nlocal = atom->nlocal;
+
+  for (i = 0; i < 3; i++)
+    for (j = 0; j < 4; j++)
+      for (k = 0; k < 3; k++) {
+        dthetadr[i][j][k] = 0.0;
+        drAB[i][j][k] = 0.0;
+        drCB[i][j][k] = 0.0;
+        drDB[i][j][k] = 0.0;
+      }
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = improperlist[n][0];
+    i2 = improperlist[n][1];
+    i3 = improperlist[n][2];
+    i4 = improperlist[n][3];
+    type = improperlist[n][4];
+
+    if (k0[type] == 0.0) continue;
+
+    // difference vectors
+
+    delr[0][0] = x[i1][0] - x[i2][0];
+    delr[0][1] = x[i1][1] - x[i2][1];
+    delr[0][2] = x[i1][2] - x[i2][2];
+
+    delr[1][0] = x[i3][0] - x[i2][0];
+    delr[1][1] = x[i3][1] - x[i2][1];
+    delr[1][2] = x[i3][2] - x[i2][2];
+
+    delr[2][0] = x[i4][0] - x[i2][0];
+    delr[2][1] = x[i4][1] - x[i2][1];
+    delr[2][2] = x[i4][2] - x[i2][2];
+
+    // bond lengths and associated values
+
+    for (i = 0; i < 3; i++) {
+      rmag2[i] = delr[i][0]*delr[i][0] + delr[i][1]*delr[i][1] +
+        delr[i][2]*delr[i][2];
+      rmag[i] = sqrt(rmag2[i]);
+      rinvmag[i] = 1.0/rmag[i];
+    }
+
+    // angle ABC, CBD, ABD
+
+    costheta[0] = (delr[0][0]*delr[1][0] + delr[0][1]*delr[1][1] +
+                   delr[0][2]*delr[1][2]) / (rmag[0]*rmag[1]);
+    costheta[1] = (delr[1][0]*delr[2][0] + delr[1][1]*delr[2][1] +
+                   delr[1][2]*delr[2][2]) / (rmag[1]*rmag[2]);
+    costheta[2] = (delr[0][0]*delr[2][0] + delr[0][1]*delr[2][1] +
+                   delr[0][2]*delr[2][2]) / (rmag[0]*rmag[2]);
+
+    // angle error check
+
+    for (i = 0; i < 3; i++) {
+      if (costheta[i] == -1.0) {
+        int me = comm->me;
+        if (screen) {
+          char str[128];
+          sprintf(str,
+                  "Improper problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                  me, thr->get_tid(),update->ntimestep,
+                  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+          error->warning(FLERR,str,0);
+          fprintf(screen,"  1st atom: %d %g %g %g\n",
+                  me,x[i1][0],x[i1][1],x[i1][2]);
+          fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                  me,x[i2][0],x[i2][1],x[i2][2]);
+          fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                  me,x[i3][0],x[i3][1],x[i3][2]);
+          fprintf(screen,"  4th atom: %d %g %g %g\n",
+                  me,x[i4][0],x[i4][1],x[i4][2]);
+        }
+      }
+    }
+
+    for (i = 0; i < 3; i++) {
+      if (costheta[i] > 1.0)  costheta[i] = 1.0;
+      if (costheta[i] < -1.0) costheta[i] = -1.0;
+      theta[i] = acos(costheta[i]);
+      cossqtheta[i] = costheta[i]*costheta[i];
+      sintheta[i] = sin(theta[i]);
+      invstheta[i] = 1.0/sintheta[i];
+      sinsqtheta[i] = sintheta[i]*sintheta[i];
+    }
+
+    // cross & dot products
+
+    cross(delr[0],delr[1],rABxrCB);
+    cross(delr[2],delr[0],rDBxrAB);
+    cross(delr[1],delr[2],rCBxrDB);
+
+    dotCBDBAB = dot(rCBxrDB,delr[0]);
+    dotDBABCB = dot(rDBxrAB,delr[1]);
+    dotABCBDB = dot(rABxrCB,delr[2]);
+
+    t = rmag[0] * rmag[1] * rmag[2];
+    inv3r = 1.0/t;
+    invs3r[0] = invstheta[1] * inv3r;
+    invs3r[1] = invstheta[2] * inv3r;
+    invs3r[2] = invstheta[0] * inv3r;
+
+    // chi ABCD, CBDA, DBAC
+    // final chi is average of three
+
+    schiABCD = dotCBDBAB * invs3r[0];
+    chiABCD = asin(schiABCD);
+    schiCBDA = dotDBABCB * invs3r[1];
+    chiCBDA = asin(schiCBDA);
+    schiDBAC = dotABCBDB * invs3r[2];
+    chiDBAC = asin(schiDBAC);
+
+    chi = (chiABCD + chiCBDA + chiDBAC) / 3.0;
+    deltachi = chi - chi0[type];
+    d2chi = deltachi * deltachi;
+
+    // energy
+
+    if (EFLAG) eimproper = k0[type]*d2chi;
+
+    // forces
+    // define d(delr)
+    // i = bond AB/CB/DB, j = atom A/B/C/D
+
+    ddelr[0][0] = 1.0;
+    ddelr[0][1] = -1.0;
+    ddelr[0][2] = 0.0;
+    ddelr[0][3] = 0.0;
+    ddelr[1][0] = 0.0;
+    ddelr[1][1] = -1.0;
+    ddelr[1][2] = 1.0;
+    ddelr[1][3] = 0.0;
+    ddelr[2][0] = 0.0;
+    ddelr[2][1] = -1.0;
+    ddelr[2][2] = 0.0;
+    ddelr[2][3] = 1.0;
+
+    // compute d(|r|)/dr and d(1/|r|)/dr for each direction, bond and atom
+    // define d(r) for each r
+    // i = bond AB/CB/DB, j = atom A/B/C/D, k = X/Y/Z
+
+    for (i = 0; i < 3; i++)
+      for (j = 0; j < 4; j++)
+        for (k = 0; k < 3; k++) {
+          dr[i][j][k] = delr[i][k] * ddelr[i][j] / rmag[i];
+          dinvr[i][j][k] = -dr[i][j][k] / rmag2[i];
+        }
+
+    // compute d(1 / (|r_AB| * |r_CB| * |r_DB|) / dr
+    // i = atom A/B/C/D, j = X/Y/Z
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++)
+        dinv3r[i][j] = rinvmag[1] * (rinvmag[2] * dinvr[0][i][j] +
+                                     rinvmag[0] * dinvr[2][i][j]) +
+          rinvmag[2] * rinvmag[0] * dinvr[1][i][j];
+
+    // compute d(theta)/d(r) for 3 angles
+    // angleABC
+
+    tt1 = costheta[0] / rmag2[0];
+    tt3 = costheta[0] / rmag2[1];
+    sc1 = 1.0 / sqrt(1.0 - cossqtheta[0]);
+
+    dthetadr[0][0][0] = sc1 * ((tt1 * delr[0][0]) -
+                               (delr[1][0] * rinvmag[0] * rinvmag[1]));
+    dthetadr[0][0][1] = sc1 * ((tt1 * delr[0][1]) -
+                               (delr[1][1] * rinvmag[0] * rinvmag[1]));
+    dthetadr[0][0][2] = sc1 * ((tt1 * delr[0][2]) -
+                               (delr[1][2] * rinvmag[0] * rinvmag[1]));
+    dthetadr[0][1][0] = -sc1 * ((tt1 * delr[0][0]) -
+                                (delr[1][0] * rinvmag[0] * rinvmag[1]) +
+                                (tt3 * delr[1][0]) -
+                                (delr[0][0] * rinvmag[0] * rinvmag[1]));
+    dthetadr[0][1][1] = -sc1 * ((tt1 * delr[0][1]) -
+                                (delr[1][1] * rinvmag[0] * rinvmag[1]) +
+                                (tt3 * delr[1][1]) -
+                                (delr[0][1] * rinvmag[0] * rinvmag[1]));
+    dthetadr[0][1][2] = -sc1 * ((tt1 * delr[0][2]) -
+                                (delr[1][2] * rinvmag[0] * rinvmag[1]) +
+                                (tt3 * delr[1][2]) -
+                                (delr[0][2] * rinvmag[0] * rinvmag[1]));
+    dthetadr[0][2][0] = sc1 * ((tt3 * delr[1][0]) -
+                               (delr[0][0] * rinvmag[0] * rinvmag[1]));
+    dthetadr[0][2][1] = sc1 * ((tt3 * delr[1][1]) -
+                               (delr[0][1] * rinvmag[0] * rinvmag[1]));
+    dthetadr[0][2][2] = sc1 * ((tt3 * delr[1][2]) -
+                               (delr[0][2] * rinvmag[0] * rinvmag[1]));
+
+    // angleCBD
+
+    tt1 = costheta[1] / rmag2[1];
+    tt3 = costheta[1] / rmag2[2];
+    sc1 = 1.0 / sqrt(1.0 - cossqtheta[1]);
+
+    dthetadr[1][2][0] = sc1 * ((tt1 * delr[1][0]) -
+                               (delr[2][0] * rinvmag[1] * rinvmag[2]));
+    dthetadr[1][2][1] = sc1 * ((tt1 * delr[1][1]) -
+                               (delr[2][1] * rinvmag[1] * rinvmag[2]));
+    dthetadr[1][2][2] = sc1 * ((tt1 * delr[1][2]) -
+                               (delr[2][2] * rinvmag[1] * rinvmag[2]));
+    dthetadr[1][1][0] = -sc1 * ((tt1 * delr[1][0]) -
+                                (delr[2][0] * rinvmag[1] * rinvmag[2]) +
+                                (tt3 * delr[2][0]) -
+                                (delr[1][0] * rinvmag[2] * rinvmag[1]));
+    dthetadr[1][1][1] = -sc1 * ((tt1 * delr[1][1]) -
+                                (delr[2][1] * rinvmag[1] * rinvmag[2]) +
+                                (tt3 * delr[2][1]) -
+                                (delr[1][1] * rinvmag[2] * rinvmag[1]));
+    dthetadr[1][1][2] = -sc1 * ((tt1 * delr[1][2]) -
+                                (delr[2][2] * rinvmag[1] * rinvmag[2]) +
+                                (tt3 * delr[2][2]) -
+                                (delr[1][2] * rinvmag[2] * rinvmag[1]));
+    dthetadr[1][3][0] = sc1 * ((tt3 * delr[2][0]) -
+                               (delr[1][0] * rinvmag[2] * rinvmag[1]));
+    dthetadr[1][3][1] = sc1 * ((tt3 * delr[2][1]) -
+                               (delr[1][1] * rinvmag[2] * rinvmag[1]));
+    dthetadr[1][3][2] = sc1 * ((tt3 * delr[2][2]) -
+                               (delr[1][2] * rinvmag[2] * rinvmag[1]));
+
+    // angleABD
+
+    tt1 = costheta[2] / rmag2[0];
+    tt3 = costheta[2] / rmag2[2];
+    sc1 = 1.0 / sqrt(1.0 - cossqtheta[2]);
+
+    dthetadr[2][0][0] = sc1 * ((tt1 * delr[0][0]) -
+                               (delr[2][0] * rinvmag[0] * rinvmag[2]));
+    dthetadr[2][0][1] = sc1 * ((tt1 * delr[0][1]) -
+                               (delr[2][1] * rinvmag[0] * rinvmag[2]));
+    dthetadr[2][0][2] = sc1 * ((tt1 * delr[0][2]) -
+                               (delr[2][2] * rinvmag[0] * rinvmag[2]));
+    dthetadr[2][1][0] = -sc1 * ((tt1 * delr[0][0]) -
+                                (delr[2][0] * rinvmag[0] * rinvmag[2]) +
+                                (tt3 * delr[2][0]) -
+                                (delr[0][0] * rinvmag[2] * rinvmag[0]));
+    dthetadr[2][1][1] = -sc1 * ((tt1 * delr[0][1]) -
+                                (delr[2][1] * rinvmag[0] * rinvmag[2]) +
+                                (tt3 * delr[2][1]) -
+                                (delr[0][1] * rinvmag[2] * rinvmag[0]));
+    dthetadr[2][1][2] = -sc1 * ((tt1 * delr[0][2]) -
+                                (delr[2][2] * rinvmag[0] * rinvmag[2]) +
+                                (tt3 * delr[2][2]) -
+                                (delr[0][2] * rinvmag[2] * rinvmag[0]));
+    dthetadr[2][3][0] = sc1 * ((tt3 * delr[2][0]) -
+                               (delr[0][0] * rinvmag[2] * rinvmag[0]));
+    dthetadr[2][3][1] = sc1 * ((tt3 * delr[2][1]) -
+                               (delr[0][1] * rinvmag[2] * rinvmag[0]));
+    dthetadr[2][3][2] = sc1 * ((tt3 * delr[2][2]) -
+                               (delr[0][2] * rinvmag[2] * rinvmag[0]));
+
+    // compute d( 1 / sin(theta))/dr
+    // i = angle, j = atom, k = direction
+
+    for (i = 0; i < 3; i++) {
+      cossin2 = -costheta[i] / sinsqtheta[i];
+      for (j = 0; j < 4; j++)
+        for (k = 0; k < 3; k++)
+          dinvsth[i][j][k] = cossin2 * dthetadr[i][j][k];
+    }
+
+    // compute d(1 / sin(theta) * |r_AB| * |r_CB| * |r_DB|)/dr
+    // i = angle, j = atom
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++) {
+        dinvs3r[0][i][j] = (invstheta[1] * dinv3r[i][j]) +
+          (inv3r * dinvsth[1][i][j]);
+        dinvs3r[1][i][j] = (invstheta[2] * dinv3r[i][j]) +
+          (inv3r * dinvsth[2][i][j]);
+        dinvs3r[2][i][j] = (invstheta[0] * dinv3r[i][j]) +
+          (inv3r * dinvsth[0][i][j]);
+      }
+
+    // drCB(i,j,k), etc
+    // i = vector X'/Y'/Z', j = atom A/B/C/D, k = direction X/Y/Z
+
+    for (i = 0; i < 3; i++) {
+      drCB[i][1][i] = -1.0;
+      drAB[i][1][i] = -1.0;
+      drDB[i][1][i] = -1.0;
+      drDB[i][3][i] = 1.0;
+      drCB[i][2][i] = 1.0;
+      drAB[i][0][i] = 1.0;
+    }
+
+    // d((r_CB x r_DB) dot r_AB)
+    // r_CB x d(r_DB)
+    // d(r_CB) x r_DB
+    // (r_CB x d(r_DB)) + (d(r_CB) x r_DB)
+    // (r_CB x d(r_DB)) + (d(r_CB) x r_DB) dot r_AB
+    // d(r_AB) dot (r_CB x r_DB)
+
+    for (i = 0; i < 3; i++)
+      for (j = 0; j < 4; j++) {
+        cross(delr[1],drDB[i][j],rCBxdrDB);
+        cross(drCB[i][j],delr[2],drCBxrDB);
+        for (k = 0; k < 3; k++) dd[k] = rCBxdrDB[k] + drCBxrDB[k];
+        dot1 = dot(dd,delr[0]);
+        dot2 = dot(rCBxrDB,drAB[i][j]);
+        fdot[0][j][i] = dot1 + dot2;
+      }
+
+    // d((r_DB x r_AB) dot r_CB)
+    // r_DB x d(r_AB)
+    // d(r_DB) x r_AB
+    // (r_DB x d(r_AB)) + (d(r_DB) x r_AB)
+    // (r_DB x d(r_AB)) + (d(r_DB) x r_AB) dot r_CB
+    // d(r_CB) dot (r_DB x r_AB)
+
+    for (i = 0; i < 3; i++)
+      for (j = 0; j < 4; j++) {
+        cross(delr[2],drAB[i][j],rDBxdrAB);
+        cross(drDB[i][j],delr[0],drDBxrAB);
+        for (k = 0; k < 3; k++) dd[k] = rDBxdrAB[k] + drDBxrAB[k];
+        dot1 = dot(dd,delr[1]);
+        dot2 = dot(rDBxrAB,drCB[i][j]);
+        fdot[1][j][i] = dot1 + dot2;
+      }
+
+    // d((r_AB x r_CB) dot r_DB)
+    // r_AB x d(r_CB)
+    // d(r_AB) x r_CB
+    // (r_AB x d(r_CB)) + (d(r_AB) x r_CB)
+    // (r_AB x d(r_CB)) + (d(r_AB) x r_CB) dot r_DB
+    // d(r_DB) dot (r_AB x r_CB)
+
+    for (i = 0; i < 3; i++)
+      for (j = 0; j < 4; j++) {
+        cross(delr[0],drCB[i][j],rABxdrCB);
+        cross(drAB[i][j],delr[1],drABxrCB);
+        for (k = 0; k < 3; k++) dd[k] = rABxdrCB[k] + drABxrCB[k];
+        dot1 = dot(dd,delr[2]);
+        dot2 = dot(rABxrCB,drDB[i][j]);
+        fdot[2][j][i] = dot1 + dot2;
+      }
+
+    // force on each atom
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++) {
+        ftmp = (fdot[0][i][j] * invs3r[0]) +
+          (dinvs3r[0][i][j] * dotCBDBAB);
+        dchi[0][i][j] = ftmp / cos(chiABCD);
+        ftmp = (fdot[1][i][j] * invs3r[1]) +
+          (dinvs3r[1][i][j] * dotDBABCB);
+        dchi[1][i][j] = ftmp / cos(chiCBDA);
+        ftmp = (fdot[2][i][j] * invs3r[2]) +
+          (dinvs3r[2][i][j] * dotABCBDB);
+        dchi[2][i][j] = ftmp / cos(chiDBAC);
+        dtotalchi[i][j] = (dchi[0][i][j]+dchi[1][i][j]+dchi[2][i][j]) / 3.0;
+      }
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++)
+        fabcd[i][j] = -2.0*k0[type] * deltachi*dtotalchi[i][j];
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += fabcd[0][0];
+      f[i1][1] += fabcd[0][1];
+      f[i1][2] += fabcd[0][2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += fabcd[1][0];
+      f[i2][1] += fabcd[1][1];
+      f[i2][2] += fabcd[1][2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += fabcd[2][0];
+      f[i3][1] += fabcd[2][1];
+      f[i3][2] += fabcd[2][2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += fabcd[3][0];
+      f[i4][1] += fabcd[3][1];
+      f[i4][2] += fabcd[3][2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,eimproper,
+                   fabcd[0],fabcd[2],fabcd[3],
+                   delr[0][0],delr[0][1],delr[0][2],
+                   delr[1][0],delr[1][1],delr[1][2],
+                   delr[2][0]-delr[1][0],delr[2][1]-delr[1][1],
+                   delr[2][2]-delr[1][2],thr);
+  }
+
+  // compute angle-angle interactions
+  angleangle_thr<EVFLAG, EFLAG, NEWTON_BOND>(nfrom,nto,thr);
+}
+
+/* ----------------------------------------------------------------------
+   angle-angle interactions within improper
+------------------------------------------------------------------------- */
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void ImproperClass2OMP::angleangle_thr(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,i4,i,j,k,n,type;
+  double eimproper;
+  double delxAB,delyAB,delzAB,rABmag2,rAB;
+  double delxBC,delyBC,delzBC,rBCmag2,rBC;
+  double delxBD,delyBD,delzBD,rBDmag2,rBD;
+  double costhABC,thetaABC,costhABD;
+  double thetaABD,costhCBD,thetaCBD,dthABC,dthCBD,dthABD;
+  double sc1,t1,t3,r12;
+  double dthetadr[3][4][3],fabcd[4][3];
+
+  eimproper = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const improperlist = neighbor->improperlist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = improperlist[n][0];
+    i2 = improperlist[n][1];
+    i3 = improperlist[n][2];
+    i4 = improperlist[n][3];
+    type = improperlist[n][4];
+
+    // difference vectors
+
+    delxAB = x[i1][0] - x[i2][0];
+    delyAB = x[i1][1] - x[i2][1];
+    delzAB = x[i1][2] - x[i2][2];
+
+    delxBC = x[i3][0] - x[i2][0];
+    delyBC = x[i3][1] - x[i2][1];
+    delzBC = x[i3][2] - x[i2][2];
+
+    delxBD = x[i4][0] - x[i2][0];
+    delyBD = x[i4][1] - x[i2][1];
+    delzBD = x[i4][2] - x[i2][2];
+
+    // bond lengths
+
+    rABmag2 = delxAB*delxAB + delyAB*delyAB + delzAB*delzAB;
+    rAB = sqrt(rABmag2);
+    rBCmag2 = delxBC*delxBC + delyBC*delyBC + delzBC*delzBC;
+    rBC = sqrt(rBCmag2);
+    rBDmag2 = delxBD*delxBD + delyBD*delyBD + delzBD*delzBD;
+    rBD = sqrt(rBDmag2);
+
+    // angle ABC, ABD, CBD
+
+    costhABC = (delxAB*delxBC + delyAB*delyBC + delzAB*delzBC) / (rAB * rBC);
+    if (costhABC > 1.0)  costhABC = 1.0;
+    if (costhABC < -1.0) costhABC = -1.0;
+    thetaABC = acos(costhABC);
+
+    costhABD = (delxAB*delxBD + delyAB*delyBD + delzAB*delzBD) / (rAB * rBD);
+    if (costhABD > 1.0)  costhABD = 1.0;
+    if (costhABD < -1.0) costhABD = -1.0;
+    thetaABD = acos(costhABD);
+
+    costhCBD = (delxBC*delxBD + delyBC*delyBD + delzBC*delzBD) /(rBC * rBD);
+    if (costhCBD > 1.0)  costhCBD = 1.0;
+    if (costhCBD < -1.0) costhCBD = -1.0;
+    thetaCBD = acos(costhCBD);
+
+    dthABC = thetaABC - aa_theta0_1[type];
+    dthABD = thetaABD - aa_theta0_2[type];
+    dthCBD = thetaCBD - aa_theta0_3[type];
+
+    // energy
+
+    if (EFLAG) eimproper = aa_k2[type] * dthABC * dthABD +
+                 aa_k1[type] * dthABC * dthCBD +
+                 aa_k3[type] * dthABD * dthCBD;
+
+    // d(theta)/d(r) array
+    // angle i, atom j, coordinate k
+
+    for (i = 0; i < 3; i++)
+      for (j = 0; j < 4; j++)
+        for (k = 0; k < 3; k++)
+          dthetadr[i][j][k] = 0.0;
+
+    // angle ABC
+
+    sc1 = sqrt(1.0/(1.0 - costhABC*costhABC));
+    t1 = costhABC / rABmag2;
+    t3 = costhABC / rBCmag2;
+    r12 = 1.0 / (rAB * rBC);
+
+    dthetadr[0][0][0] = sc1 * ((t1 * delxAB) - (delxBC * r12));
+    dthetadr[0][0][1] = sc1 * ((t1 * delyAB) - (delyBC * r12));
+    dthetadr[0][0][2] = sc1 * ((t1 * delzAB) - (delzBC * r12));
+    dthetadr[0][1][0] = sc1 * ((-t1 * delxAB) + (delxBC * r12) +
+                               (-t3 * delxBC) + (delxAB * r12));
+    dthetadr[0][1][1] = sc1 * ((-t1 * delyAB) + (delyBC * r12) +
+                               (-t3 * delyBC) + (delyAB * r12));
+    dthetadr[0][1][2] = sc1 * ((-t1 * delzAB) + (delzBC * r12) +
+                               (-t3 * delzBC) + (delzAB * r12));
+    dthetadr[0][2][0] = sc1 * ((t3 * delxBC) - (delxAB * r12));
+    dthetadr[0][2][1] = sc1 * ((t3 * delyBC) - (delyAB * r12));
+    dthetadr[0][2][2] = sc1 * ((t3 * delzBC) - (delzAB * r12));
+
+    // angle CBD
+
+    sc1 = sqrt(1.0/(1.0 - costhCBD*costhCBD));
+    t1 = costhCBD / rBCmag2;
+    t3 = costhCBD / rBDmag2;
+    r12 = 1.0 / (rBC * rBD);
+
+    dthetadr[1][2][0] = sc1 * ((t1 * delxBC) - (delxBD * r12));
+    dthetadr[1][2][1] = sc1 * ((t1 * delyBC) - (delyBD * r12));
+    dthetadr[1][2][2] = sc1 * ((t1 * delzBC) - (delzBD * r12));
+    dthetadr[1][1][0] = sc1 * ((-t1 * delxBC) + (delxBD * r12) +
+                               (-t3 * delxBD) + (delxBC * r12));
+    dthetadr[1][1][1] = sc1 * ((-t1 * delyBC) + (delyBD * r12) +
+                               (-t3 * delyBD) + (delyBC * r12));
+    dthetadr[1][1][2] = sc1 * ((-t1 * delzBC) + (delzBD * r12) +
+                               (-t3 * delzBD) + (delzBC * r12));
+    dthetadr[1][3][0] = sc1 * ((t3 * delxBD) - (delxBC * r12));
+    dthetadr[1][3][1] = sc1 * ((t3 * delyBD) - (delyBC * r12));
+    dthetadr[1][3][2] = sc1 * ((t3 * delzBD) - (delzBC * r12));
+
+    // angle ABD
+
+    sc1 = sqrt(1.0/(1.0 - costhABD*costhABD));
+    t1 = costhABD / rABmag2;
+    t3 = costhABD / rBDmag2;
+    r12 = 1.0 / (rAB * rBD);
+
+    dthetadr[2][0][0] = sc1 * ((t1 * delxAB) - (delxBD * r12));
+    dthetadr[2][0][1] = sc1 * ((t1 * delyAB) - (delyBD * r12));
+    dthetadr[2][0][2] = sc1 * ((t1 * delzAB) - (delzBD * r12));
+    dthetadr[2][1][0] = sc1 * ((-t1 * delxAB) + (delxBD * r12) +
+                               (-t3 * delxBD) + (delxAB * r12));
+    dthetadr[2][1][1] = sc1 * ((-t1 * delyAB) + (delyBD * r12) +
+                               (-t3 * delyBD) + (delyAB * r12));
+    dthetadr[2][1][2] = sc1 * ((-t1 * delzAB) + (delzBD * r12) +
+                               (-t3 * delzBD) + (delzAB * r12));
+    dthetadr[2][3][0] = sc1 * ((t3 * delxBD) - (delxAB * r12));
+    dthetadr[2][3][1] = sc1 * ((t3 * delyBD) - (delyAB * r12));
+    dthetadr[2][3][2] = sc1 * ((t3 * delzBD) - (delzAB * r12));
+
+    // angleangle forces
+
+    for (i = 0; i < 4; i++)
+      for (j = 0; j < 3; j++)
+        fabcd[i][j] = -
+          ((aa_k1[type] *
+            (dthABC*dthetadr[1][i][j] + dthCBD*dthetadr[0][i][j])) +
+           (aa_k2[type] *
+            (dthABC*dthetadr[2][i][j] + dthABD*dthetadr[0][i][j])) +
+           (aa_k3[type] *
+            (dthABD*dthetadr[1][i][j] + dthCBD*dthetadr[2][i][j])));
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += fabcd[0][0];
+      f[i1][1] += fabcd[0][1];
+      f[i1][2] += fabcd[0][2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += fabcd[1][0];
+      f[i2][1] += fabcd[1][1];
+      f[i2][2] += fabcd[1][2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += fabcd[2][0];
+      f[i3][1] += fabcd[2][1];
+      f[i3][2] += fabcd[2][2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += fabcd[3][0];
+      f[i4][1] += fabcd[3][1];
+      f[i4][2] += fabcd[3][2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,eimproper,
+                   fabcd[0],fabcd[2],fabcd[3],delxAB,delyAB,delzAB,
+                   delxBC,delyBC,delzBC,delxBD,delyBD,delzBD,thr);
+  }
+}
diff --git a/src/USER-OMP/improper_class2_omp.h b/src/USER-OMP/improper_class2_omp.h
new file mode 100644
index 0000000000..6f668a0078
--- /dev/null
+++ b/src/USER-OMP/improper_class2_omp.h
@@ -0,0 +1,50 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef IMPROPER_CLASS
+
+ImproperStyle(class2/omp,ImproperClass2OMP)
+
+#else
+
+#ifndef LMP_IMPROPER_CLASS2_OMP_H
+#define LMP_IMPROPER_CLASS2_OMP_H
+
+#include "improper_class2.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class ImproperClass2OMP : public ImproperClass2, public ThrOMP {
+
+ public:
+  ImproperClass2OMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void angleangle_thr(int, int, ThrData * const thr);
+
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/improper_cossq_omp.cpp b/src/USER-OMP/improper_cossq_omp.cpp
new file mode 100644
index 0000000000..c49cd0b3ce
--- /dev/null
+++ b/src/USER-OMP/improper_cossq_omp.cpp
@@ -0,0 +1,267 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "improper_cossq_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+ImproperCossqOMP::ImproperCossqOMP(class LAMMPS *lmp)
+  : ImproperCossq(lmp), ThrOMP(lmp,THR_IMPROPER)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ImproperCossqOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nimproperlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void ImproperCossqOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,i4,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z;
+  double eimproper,f1[3],f2[3],f3[3],f4[3];
+  double rjisq, rji, rlksq, rlk, cosphi, angfac;
+  double cjiji, clkji, clklk, cfact1, cfact2, cfact3;
+
+  eimproper = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const improperlist = neighbor->improperlist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = improperlist[n][0];
+    i2 = improperlist[n][1];
+    i3 = improperlist[n][2];
+    i4 = improperlist[n][3];
+    type = improperlist[n][4];
+
+    /* separation vector between i1 and i2, (i2-i1) */
+    vb1x = x[i2][0] - x[i1][0];
+    vb1y = x[i2][1] - x[i1][1];
+    vb1z = x[i2][2] - x[i1][2];
+    rjisq = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z ;
+    rji = sqrt(rjisq);
+
+    /* separation vector between i2 and i3 (i3-i2) */
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    /* separation vector between i3 and i4, (i4-i3) */
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+    rlksq = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z ;
+    rlk = sqrt(rlksq);
+
+    cosphi = (vb3x*vb1x + vb3y*vb1y + vb3z*vb1z)/(rji * rlk);
+
+    /* Check that cos(phi) is in the correct limits. */
+    if (cosphi > 1.0 + TOLERANCE || cosphi < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Improper problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+
+
+      /* Apply corrections to round-off errors. */
+      if (cosphi > 1.0)  cosphi -= SMALL;
+      if (cosphi < -1.0) cosphi += SMALL;
+
+      /* Calculate the angle: */
+      double torangle = acos(cosphi);
+      cosphi = cos(torangle - chi[type]);
+
+      if (EFLAG) eimproper = 0.5 * k[type] * cosphi * cosphi;
+
+      /*
+        printf("The tags: %d-%d-%d-%d, of type %d .\n",atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4],type);
+        printf("The ji vector: %f, %f, %f.\nThe lk vector: %f, %f, %f.\n", vb1x,vb1y,vb1z,vb3x,vb3y,vb3z);
+        printf("The cosine of the angle: %-1.16e.\n", cosphi);
+        printf("The energy of the improper: %-1.16e with prefactor %-1.16e.\n", eimproper, 0.5*k[type]);
+      */
+
+      /* Work out forces. */
+      angfac = - k[type] * cosphi;
+
+      cjiji = rjisq;
+      clklk = rlksq;
+      /*CLKJI = RXLK * RXJI + RYLK * RYJI + RZLK * RZJI */
+      clkji = vb3x*vb1x + vb3y*vb1y + vb3z*vb1z;
+
+      /*CFACT1 = CLKLK * CJIJI
+        CFACT1 = SQRT(CFACT1)
+        CFACT1 = ANGFAC / CFACT1*/
+      cfact1 = angfac / sqrt(clklk * cjiji);
+      /*CFACT2 = CLKJI / CLKLK*/
+      cfact2 = clkji / clklk;
+      /*CFACT3 = CLKJI / CJIJI*/
+      cfact3 = clkji / cjiji;
+
+      /*FIX = -RXLK + CFACT3 * RXJI
+        FIY = -RYLK + CFACT3 * RYJI
+        FIZ = -RZLK + CFACT3 * RZJI*/
+      f1[0] = - vb3x + cfact3 * vb1x;
+      f1[1] = - vb3y + cfact3 * vb1y;
+      f1[2] = - vb3z + cfact3 * vb1z;
+
+      /*FJX = -FIX
+        FJY = -FIY
+        FJZ = -FIZ*/
+      f2[0] = - f1[0];
+      f2[1] = - f1[1];
+      f2[2] = - f1[2];
+
+      /*FKX = CFACT2 * RXLK - RXJI
+        FKY = CFACT2 * RYLK - RYJI
+        FKZ = CFACT2 * RZLK - RZJI*/
+      f3[0] = cfact2 * vb3x - vb1x;
+      f3[1] = cfact2 * vb3y - vb1y;
+      f3[2] = cfact2 * vb3z - vb1z;
+
+      /*FLX = -FKX
+        FLY = -FKY
+        FLZ = -FKZ*/
+      f4[0] = - f3[0];
+      f4[1] = - f3[1];
+      f4[2] = - f3[2];
+
+      /*FIX = FIX * CFACT1
+        FIY = FIY * CFACT1
+        FIZ = FIZ * CFACT1*/
+      f1[0] *= cfact1;
+      f1[1] *= cfact1;
+      f1[2] *= cfact1;
+
+      /*FJX = FJX * CFACT1
+        FJY = FJY * CFACT1
+        FJZ = FJZ * CFACT1*/
+      f2[0] *= cfact1;
+      f2[1] *= cfact1;
+      f2[2] *= cfact1;
+
+      /*FKX = FKX * CFACT1
+        FKY = FKY * CFACT1
+        FKZ = FKZ * CFACT1*/
+      f3[0] *= cfact1;
+      f3[1] *= cfact1;
+      f3[2] *= cfact1;
+
+      /*FLX = FLX * CFACT1
+        FLY = FLY * CFACT1
+        FLZ = FLZ * CFACT1*/
+      f4[0] *= cfact1;
+      f4[1] *= cfact1;
+      f4[2] *= cfact1;
+
+      /* Apply force to each of 4 atoms */
+      if (NEWTON_BOND || i1 < nlocal) {
+        f[i1][0] += f1[0];
+        f[i1][1] += f1[1];
+        f[i1][2] += f1[2];
+      }
+
+      if (NEWTON_BOND || i2 < nlocal) {
+        f[i2][0] += f2[0];
+        f[i2][1] += f2[1];
+        f[i2][2] += f2[2];
+      }
+
+      if (NEWTON_BOND || i3 < nlocal) {
+        f[i3][0] += f3[0];
+        f[i3][1] += f3[1];
+        f[i3][2] += f3[2];
+      }
+
+      if (NEWTON_BOND || i4 < nlocal) {
+        f[i4][0] += f4[0];
+        f[i4][1] += f4[1];
+        f[i4][2] += f4[2];
+      }
+
+      if (EVFLAG)
+        ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,eimproper,f1,f3,f4,
+                     -vb1x,-vb1y,-vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+    }
+  }
+}
diff --git a/src/USER-OMP/improper_cossq_omp.h b/src/USER-OMP/improper_cossq_omp.h
new file mode 100644
index 0000000000..8fd1bc09a0
--- /dev/null
+++ b/src/USER-OMP/improper_cossq_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef IMPROPER_CLASS
+
+ImproperStyle(cossq/omp,ImproperCossqOMP)
+
+#else
+
+#ifndef LMP_IMPROPER_COSSQ_OMP_H
+#define LMP_IMPROPER_COSSQ_OMP_H
+
+#include "improper_cossq.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class ImproperCossqOMP : public ImproperCossq, public ThrOMP {
+
+ public:
+  ImproperCossqOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/improper_cvff_omp.cpp b/src/USER-OMP/improper_cvff_omp.cpp
new file mode 100644
index 0000000000..819a22b66e
--- /dev/null
+++ b/src/USER-OMP/improper_cvff_omp.cpp
@@ -0,0 +1,298 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "improper_cvff_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+ImproperCvffOMP::ImproperCvffOMP(class LAMMPS *lmp)
+  : ImproperCvff(lmp), ThrOMP(lmp,THR_IMPROPER)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ImproperCvffOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nimproperlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void ImproperCvffOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,i4,m,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,vb2xm,vb2ym,vb2zm;
+  double eimproper,f1[3],f2[3],f3[3],f4[3];
+  double sb1,sb2,sb3,rb1,rb3,c0,b1mag2,b1mag,b2mag2;
+  double b2mag,b3mag2,b3mag,ctmp,r12c1,c1mag,r12c2;
+  double c2mag,sc1,sc2,s1,s2,s12,c,p,pd,rc2,a,a11,a22;
+  double a33,a12,a13,a23,sx2,sy2,sz2;
+
+  eimproper = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const improperlist = neighbor->improperlist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = improperlist[n][0];
+    i2 = improperlist[n][1];
+    i3 = improperlist[n][2];
+    i4 = improperlist[n][3];
+    type = improperlist[n][4];
+
+    // 1st bond
+
+    vb1x = x[i1][0] - x[i2][0];
+    vb1y = x[i1][1] - x[i2][1];
+    vb1z = x[i1][2] - x[i2][2];
+
+    // 2nd bond
+
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    vb2xm = -vb2x;
+    vb2ym = -vb2y;
+    vb2zm = -vb2z;
+
+    // 3rd bond
+
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+
+    // c0 calculation
+
+    sb1 = 1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
+    sb2 = 1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z);
+    sb3 = 1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
+
+    rb1 = sqrt(sb1);
+    rb3 = sqrt(sb3);
+
+    c0 = (vb1x*vb3x + vb1y*vb3y + vb1z*vb3z) * rb1*rb3;
+
+    // 1st and 2nd angle
+
+    b1mag2 = vb1x*vb1x + vb1y*vb1y + vb1z*vb1z;
+    b1mag = sqrt(b1mag2);
+    b2mag2 = vb2x*vb2x + vb2y*vb2y + vb2z*vb2z;
+    b2mag = sqrt(b2mag2);
+    b3mag2 = vb3x*vb3x + vb3y*vb3y + vb3z*vb3z;
+    b3mag = sqrt(b3mag2);
+
+    ctmp = vb1x*vb2x + vb1y*vb2y + vb1z*vb2z;
+    r12c1 = 1.0 / (b1mag*b2mag);
+    c1mag = ctmp * r12c1;
+
+    ctmp = vb2xm*vb3x + vb2ym*vb3y + vb2zm*vb3z;
+    r12c2 = 1.0 / (b2mag*b3mag);
+    c2mag = ctmp * r12c2;
+
+    // cos and sin of 2 angles and final c
+
+    sc1 = sqrt(1.0 - c1mag*c1mag);
+    if (sc1 < SMALL) sc1 = SMALL;
+    sc1 = 1.0/sc1;
+
+    sc2 = sqrt(1.0 - c2mag*c2mag);
+    if (sc2 < SMALL) sc2 = SMALL;
+    sc2 = 1.0/sc2;
+
+    s1 = sc1 * sc1;
+    s2 = sc2 * sc2;
+    s12 = sc1 * sc2;
+    c = (c0 + c1mag*c2mag) * s12;
+
+    // error check
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Improper problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    // force & energy
+    // p = 1 + cos(n*phi) for d = 1
+    // p = 1 - cos(n*phi) for d = -1
+    // pd = dp/dc / 2
+
+    m = multiplicity[type];
+
+    if (m == 2) {
+      p = 2.0*c*c;
+      pd = 2.0*c;
+    } else if (m == 3) {
+      rc2 = c*c;
+      p = (4.0*rc2-3.0)*c + 1.0;
+      pd = 6.0*rc2 - 1.5;
+    } else if (m == 4) {
+      rc2 = c*c;
+      p = 8.0*(rc2-1)*rc2 + 2.0;
+      pd = (16.0*rc2-8.0)*c;
+    } else if (m == 6) {
+      rc2 = c*c;
+      p = ((32.0*rc2-48.0)*rc2 + 18.0)*rc2;
+      pd = (96.0*(rc2-1.0)*rc2 + 18.0)*c;
+    } else if (m == 1) {
+      p = c + 1.0;
+      pd = 0.5;
+    } else if (m == 5) {
+      rc2 = c*c;
+      p = ((16.0*rc2-20.0)*rc2 + 5.0)*c + 1.0;
+      pd = (40.0*rc2-30.0)*rc2 + 2.5;
+    } else if (m == 0) {
+      p = 2.0;
+      pd = 0.0;
+    }
+
+    if (sign[type] == -1) {
+      p = 2.0 - p;
+      pd = -pd;
+    }
+
+    if (EFLAG) eimproper = k[type]*p;
+
+    a = 2.0 * k[type] * pd;
+    c = c * a;
+    s12 = s12 * a;
+    a11 = c*sb1*s1;
+    a22 = -sb2*(2.0*c0*s12 - c*(s1+s2));
+    a33 = c*sb3*s2;
+    a12 = -r12c1*(c1mag*c*s1 + c2mag*s12);
+    a13 = -rb1*rb3*s12;
+    a23 = r12c2*(c2mag*c*s2 + c1mag*s12);
+
+    sx2  = a12*vb1x + a22*vb2x + a23*vb3x;
+    sy2  = a12*vb1y + a22*vb2y + a23*vb3y;
+    sz2  = a12*vb1z + a22*vb2z + a23*vb3z;
+
+    f1[0] = a11*vb1x + a12*vb2x + a13*vb3x;
+    f1[1] = a11*vb1y + a12*vb2y + a13*vb3y;
+    f1[2] = a11*vb1z + a12*vb2z + a13*vb3z;
+
+    f2[0] = -sx2 - f1[0];
+    f2[1] = -sy2 - f1[1];
+    f2[2] = -sz2 - f1[2];
+
+    f4[0] = a13*vb1x + a23*vb2x + a33*vb3x;
+    f4[1] = a13*vb1y + a23*vb2y + a33*vb3y;
+    f4[2] = a13*vb1z + a23*vb2z + a33*vb3z;
+
+    f3[0] = sx2 - f4[0];
+    f3[1] = sy2 - f4[1];
+    f3[2] = sz2 - f4[2];
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f2[0];
+      f[i2][1] += f2[1];
+      f[i2][2] += f2[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0];
+      f[i4][1] += f4[1];
+      f[i4][2] += f4[2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,eimproper,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
diff --git a/src/USER-OMP/improper_cvff_omp.h b/src/USER-OMP/improper_cvff_omp.h
new file mode 100644
index 0000000000..8ab8f1615c
--- /dev/null
+++ b/src/USER-OMP/improper_cvff_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef IMPROPER_CLASS
+
+ImproperStyle(cvff/omp,ImproperCvffOMP)
+
+#else
+
+#ifndef LMP_IMPROPER_CVFF_OMP_H
+#define LMP_IMPROPER_CVFF_OMP_H
+
+#include "improper_cvff.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class ImproperCvffOMP : public ImproperCvff, public ThrOMP {
+
+ public:
+  ImproperCvffOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/improper_harmonic_omp.cpp b/src/USER-OMP/improper_harmonic_omp.cpp
new file mode 100644
index 0000000000..4ac8351a80
--- /dev/null
+++ b/src/USER-OMP/improper_harmonic_omp.cpp
@@ -0,0 +1,240 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "improper_harmonic_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+ImproperHarmonicOMP::ImproperHarmonicOMP(class LAMMPS *lmp)
+  : ImproperHarmonic(lmp), ThrOMP(lmp,THR_IMPROPER)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ImproperHarmonicOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nimproperlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void ImproperHarmonicOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,i4,n,type;
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z;
+  double eimproper,f1[3],f2[3],f3[3],f4[3];
+  double ss1,ss2,ss3,r1,r2,r3,c0,c1,c2,s1,s2;
+  double s12,c,s,domega,a,a11,a22,a33,a12,a13,a23;
+  double sx2,sy2,sz2;
+
+  eimproper = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const improperlist = neighbor->improperlist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = improperlist[n][0];
+    i2 = improperlist[n][1];
+    i3 = improperlist[n][2];
+    i4 = improperlist[n][3];
+    type = improperlist[n][4];
+
+    // geometry of 4-body
+
+    vb1x = x[i1][0] - x[i2][0];
+    vb1y = x[i1][1] - x[i2][1];
+    vb1z = x[i1][2] - x[i2][2];
+
+    vb2x = x[i3][0] - x[i2][0];
+    vb2y = x[i3][1] - x[i2][1];
+    vb2z = x[i3][2] - x[i2][2];
+
+    vb3x = x[i4][0] - x[i3][0];
+    vb3y = x[i4][1] - x[i3][1];
+    vb3z = x[i4][2] - x[i3][2];
+
+    ss1 = 1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
+    ss2 = 1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z);
+    ss3 = 1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
+
+    r1 = sqrt(ss1);
+    r2 = sqrt(ss2);
+    r3 = sqrt(ss3);
+
+    // sin and cos of angle
+
+    c0 = (vb1x * vb3x + vb1y * vb3y + vb1z * vb3z) * r1 * r3;
+    c1 = (vb1x * vb2x + vb1y * vb2y + vb1z * vb2z) * r1 * r2;
+    c2 = -(vb3x * vb2x + vb3y * vb2y + vb3z * vb2z) * r3 * r2;
+
+    s1 = 1.0 - c1*c1;
+    if (s1 < SMALL) s1 = SMALL;
+    s1 = 1.0 / s1;
+
+    s2 = 1.0 - c2*c2;
+    if (s2 < SMALL) s2 = SMALL;
+    s2 = 1.0 / s2;
+
+    s12 = sqrt(s1*s2);
+    c = (c1*c2 + c0) * s12;
+
+    // error check
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Improper problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    if (c > 1.0) c = 1.0;
+    if (c < -1.0) c = -1.0;
+
+    s = sqrt(1.0 - c*c);
+    if (s < SMALL) s = SMALL;
+
+    // force & energy
+
+    domega = acos(c) - chi[type];
+    a = k[type] * domega;
+
+    if (EFLAG) eimproper = a*domega;
+
+    a = -a * 2.0/s;
+    c = c * a;
+    s12 = s12 * a;
+    a11 = c*ss1*s1;
+    a22 = -ss2 * (2.0*c0*s12 - c*(s1+s2));
+    a33 = c*ss3*s2;
+    a12 = -r1*r2*(c1*c*s1 + c2*s12);
+    a13 = -r1*r3*s12;
+    a23 = r2*r3*(c2*c*s2 + c1*s12);
+
+    sx2  = a22*vb2x + a23*vb3x + a12*vb1x;
+    sy2  = a22*vb2y + a23*vb3y + a12*vb1y;
+    sz2  = a22*vb2z + a23*vb3z + a12*vb1z;
+
+    f1[0] = a12*vb2x + a13*vb3x + a11*vb1x;
+    f1[1] = a12*vb2y + a13*vb3y + a11*vb1y;
+    f1[2] = a12*vb2z + a13*vb3z + a11*vb1z;
+
+    f2[0] = -sx2 - f1[0];
+    f2[1] = -sy2 - f1[1];
+    f2[2] = -sz2 - f1[2];
+
+    f4[0] = a23*vb2x + a33*vb3x + a13*vb1x;
+    f4[1] = a23*vb2y + a33*vb3y + a13*vb1y;
+    f4[2] = a23*vb2z + a33*vb3z + a13*vb1z;
+
+    f3[0] = sx2 - f4[0];
+    f3[1] = sy2 - f4[1];
+    f3[2] = sz2 - f4[2];
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0];
+      f[i1][1] += f1[1];
+      f[i1][2] += f1[2];
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f2[0];
+      f[i2][1] += f2[1];
+      f[i2][2] += f2[2];
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f3[0];
+      f[i3][1] += f3[1];
+      f[i3][2] += f3[2];
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0];
+      f[i4][1] += f4[1];
+      f[i4][2] += f4[2];
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,eimproper,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
diff --git a/src/USER-OMP/improper_harmonic_omp.h b/src/USER-OMP/improper_harmonic_omp.h
new file mode 100644
index 0000000000..d6325016a7
--- /dev/null
+++ b/src/USER-OMP/improper_harmonic_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef IMPROPER_CLASS
+
+ImproperStyle(harmonic/omp,ImproperHarmonicOMP)
+
+#else
+
+#ifndef LMP_IMPROPER_HARMONIC_OMP_H
+#define LMP_IMPROPER_HARMONIC_OMP_H
+
+#include "improper_harmonic.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class ImproperHarmonicOMP : public ImproperHarmonic, public ThrOMP {
+
+ public:
+  ImproperHarmonicOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/improper_ring_omp.cpp b/src/USER-OMP/improper_ring_omp.cpp
new file mode 100644
index 0000000000..78b15a0453
--- /dev/null
+++ b/src/USER-OMP/improper_ring_omp.cpp
@@ -0,0 +1,261 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "improper_ring_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+ImproperRingOMP::ImproperRingOMP(class LAMMPS *lmp)
+  : ImproperRing(lmp), ThrOMP(lmp,THR_IMPROPER)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ImproperRingOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nimproperlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void ImproperRingOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  /* Be careful!: "chi" is the equilibrium angle in radians. */
+  int i1,i2,i3,i4,n,type;
+
+  double eimproper;
+
+  /* Compatibility variables. */
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z;
+  double f1[3], f3[3], f4[3];
+
+  /* Actual computation variables. */
+  int at1[3], at2[3], at3[3], icomb;
+  double   bvec1x[3], bvec1y[3], bvec1z[3],
+    bvec2x[3], bvec2y[3], bvec2z[3],
+    bvec1n[3], bvec2n[3], bend_angle[3];
+  double   angle_summer, angfac, cfact1, cfact2, cfact3;
+  double   cjiji, ckjji, ckjkj, fix, fiy, fiz, fjx, fjy, fjz, fkx, fky, fkz;
+
+  eimproper = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const improperlist = neighbor->improperlist;
+  const int nlocal = atom->nlocal;
+
+  /* A description of the potential can be found in
+     Macromolecules 35, pp. 1463-1472 (2002). */
+  for (n = nfrom; n < nto; n++) {
+    /* Take the ids of the atoms contributing to the improper potential. */
+    i1 = improperlist[n][0];   /* Atom "1" of Figure 1 from the above reference.*/
+    i2 = improperlist[n][1];   /* Atom "2" ... */
+    i3 = improperlist[n][2];   /* Atom "3" ... */
+    i4 = improperlist[n][3];   /* Atom "9" ... */
+    type = improperlist[n][4];
+
+    /* Calculate the necessary variables for LAMMPS implementation.
+       if (evflag) ev_tally(i1,i2,i3,i4,nlocal,newton_bond,eimproper,f1,f3,f4,
+       vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z);
+       Although, they are irrelevant to the calculation of the potential, we keep
+       them for maximal compatibility. */
+    vb1x = x[i1][0] - x[i2][0]; vb1y = x[i1][1] - x[i2][1]; vb1z = x[i1][2] - x[i2][2];
+
+    vb2x = x[i3][0] - x[i2][0]; vb2y = x[i3][1] - x[i2][1]; vb2z = x[i3][2] - x[i2][2];
+
+    vb3x = x[i4][0] - x[i3][0]; vb3y = x[i4][1] - x[i3][1]; vb3z = x[i4][2] - x[i3][2];
+
+
+    /* Pass the atom tags to form the necessary combinations. */
+    at1[0] = i1; at2[0] = i2; at3[0] = i4;  /* ids: 1-2-9 */
+    at1[1] = i1; at2[1] = i2; at3[1] = i3;  /* ids: 1-2-3 */
+    at1[2] = i4; at2[2] = i2; at3[2] = i3;  /* ids: 9-2-3 */
+
+
+    /* Initialize the sum of the angles differences. */
+    angle_summer = 0.0;
+    /* Take a loop over the three angles, defined by each triad: */
+    for (icomb = 0; icomb < 3; icomb ++) {
+
+      /* Bond vector connecting the first and the second atom. */
+      bvec1x[icomb] = x[at2[icomb]][0] - x[at1[icomb]][0];
+      bvec1y[icomb] = x[at2[icomb]][1] - x[at1[icomb]][1];
+      bvec1z[icomb] = x[at2[icomb]][2] - x[at1[icomb]][2];
+      /* also calculate the norm of the vector: */
+      bvec1n[icomb] = sqrt(  bvec1x[icomb]*bvec1x[icomb]
+                             + bvec1y[icomb]*bvec1y[icomb]
+                             + bvec1z[icomb]*bvec1z[icomb]);
+      /* Bond vector connecting the second and the third atom. */
+      bvec2x[icomb] = x[at3[icomb]][0] - x[at2[icomb]][0];
+      bvec2y[icomb] = x[at3[icomb]][1] - x[at2[icomb]][1];
+      bvec2z[icomb] = x[at3[icomb]][2] - x[at2[icomb]][2];
+      /* also calculate the norm of the vector: */
+      bvec2n[icomb] = sqrt(  bvec2x[icomb]*bvec2x[icomb]
+                             + bvec2y[icomb]*bvec2y[icomb]
+                             + bvec2z[icomb]*bvec2z[icomb]);
+
+      /* Calculate the bending angle of the atom triad: */
+      bend_angle[icomb] = (  bvec2x[icomb]*bvec1x[icomb]
+                             + bvec2y[icomb]*bvec1y[icomb]
+                             + bvec2z[icomb]*bvec1z[icomb]);
+      bend_angle[icomb] /= (bvec1n[icomb] * bvec2n[icomb]);
+      if (bend_angle[icomb] >  1.0) bend_angle[icomb] -= SMALL;
+      if (bend_angle[icomb] < -1.0) bend_angle[icomb] += SMALL;
+
+      /* Append the current angle to the sum of angle differences. */
+      angle_summer += (bend_angle[icomb] - chi[type]);
+    }
+    if (EFLAG) eimproper = (1.0/6.0) *k[type] * pow(angle_summer,6.0);
+    /*
+      printf("The tags: %d-%d-%d-%d, of type %d .\n",atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4],type);
+      // printf("The coordinates of the first: %f, %f, %f.\n", x[i1][0], x[i1][1], x[i1][2]);
+      // printf("The coordinates of the second: %f, %f, %f.\n", x[i2][0], x[i2][1], x[i2][2]);
+      // printf("The coordinates of the third: %f, %f, %f.\n", x[i3][0], x[i3][1], x[i3][2]);
+      // printf("The coordinates of the fourth: %f, %f, %f.\n", x[i4][0], x[i4][1], x[i4][2]);
+      printf("The angles are: %f / %f / %f equilibrium: %f.\n", bend_angle[0], bend_angle[1], bend_angle[2],chi[type]);
+      printf("The energy of the improper: %f with prefactor %f.\n", eimproper,(1.0/6.0)*k[type]);
+      printf("The sum of the angles: %f.\n", angle_summer);
+    */
+
+    /* Force calculation acting on all atoms.
+       Calculate the derivatives of the potential. */
+    angfac = k[type] * pow(angle_summer,5.0);
+
+    f1[0] = 0.0; f1[1] = 0.0; f1[2] = 0.0;
+    f3[0] = 0.0; f3[1] = 0.0; f3[2] = 0.0;
+    f4[0] = 0.0; f4[1] = 0.0; f4[2] = 0.0;
+
+    /* Take a loop over the three angles, defined by each triad: */
+    for (icomb = 0; icomb < 3; icomb ++)
+      {
+        /* Calculate the squares of the distances. */
+        cjiji = bvec1n[icomb] * bvec1n[icomb];  ckjkj = bvec2n[icomb] * bvec2n[icomb];
+
+        ckjji =   bvec2x[icomb] * bvec1x[icomb]
+          + bvec2y[icomb] * bvec1y[icomb]
+          + bvec2z[icomb] * bvec1z[icomb] ;
+
+        cfact1 = angfac / (sqrt(ckjkj * cjiji));
+        cfact2 = ckjji / ckjkj;
+        cfact3 = ckjji / cjiji;
+
+        /* Calculate the force acted on the thrid atom of the angle. */
+        fkx = cfact2 * bvec2x[icomb] - bvec1x[icomb];
+        fky = cfact2 * bvec2y[icomb] - bvec1y[icomb];
+        fkz = cfact2 * bvec2z[icomb] - bvec1z[icomb];
+
+        /* Calculate the force acted on the first atom of the angle. */
+        fix = bvec2x[icomb] - cfact3 * bvec1x[icomb];
+        fiy = bvec2y[icomb] - cfact3 * bvec1y[icomb];
+        fiz = bvec2z[icomb] - cfact3 * bvec1z[icomb];
+
+        /* Finally, calculate the force acted on the middle atom of the angle.*/
+        fjx = - fix - fkx;  fjy = - fiy - fky;  fjz = - fiz - fkz;
+
+        /* Consider the appropriate scaling of the forces: */
+        fix *= cfact1; fiy *= cfact1; fiz *= cfact1;
+        fjx *= cfact1; fjy *= cfact1; fjz *= cfact1;
+        fkx *= cfact1; fky *= cfact1; fkz *= cfact1;
+
+        if      (at1[icomb] == i1)  {f1[0] += fix; f1[1] += fiy; f1[2] += fiz;}
+        else if (at2[icomb] == i1)  {f1[0] += fjx; f1[1] += fjy; f1[2] += fjz;}
+        else if (at3[icomb] == i1)  {f1[0] += fkx; f1[1] += fky; f1[2] += fkz;}
+
+        if      (at1[icomb] == i3)  {f3[0] += fix; f3[1] += fiy; f3[2] += fiz;}
+        else if (at2[icomb] == i3)  {f3[0] += fjx; f3[1] += fjy; f3[2] += fjz;}
+        else if (at3[icomb] == i3)  {f3[0] += fkx; f3[1] += fky; f3[2] += fkz;}
+
+        if      (at1[icomb] == i4)  {f4[0] += fix; f4[1] += fiy; f4[2] += fiz;}
+        else if (at2[icomb] == i4)  {f4[0] += fjx; f4[1] += fjy; f4[2] += fjz;}
+        else if (at3[icomb] == i4)  {f4[0] += fkx; f4[1] += fky; f4[2] += fkz;}
+
+
+        /* Store the contribution to the global arrays: */
+        /* Take the id of the atom from the at1[icomb] element, i1 = at1[icomb]. */
+        if (NEWTON_BOND || at1[icomb] < nlocal) {
+          f[at1[icomb]][0] += fix;
+          f[at1[icomb]][1] += fiy;
+          f[at1[icomb]][2] += fiz;
+        }
+        /* Take the id of the atom from the at2[icomb] element, i2 = at2[icomb]. */
+        if (NEWTON_BOND || at2[icomb] < nlocal) {
+          f[at2[icomb]][0] += fjx;
+          f[at2[icomb]][1] += fjy;
+          f[at2[icomb]][2] += fjz;
+        }
+        /* Take the id of the atom from the at3[icomb] element, i3 = at3[icomb]. */
+        if (NEWTON_BOND || at3[icomb] < nlocal) {
+          f[at3[icomb]][0] += fkx;
+          f[at3[icomb]][1] += fky;
+          f[at3[icomb]][2] += fkz;
+        }
+
+      }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,eimproper,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
diff --git a/src/USER-OMP/improper_ring_omp.h b/src/USER-OMP/improper_ring_omp.h
new file mode 100644
index 0000000000..2b2616ab03
--- /dev/null
+++ b/src/USER-OMP/improper_ring_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef IMPROPER_CLASS
+
+ImproperStyle(ring/omp,ImproperRingOMP)
+
+#else
+
+#ifndef LMP_IMPROPER_RING_OMP_H
+#define LMP_IMPROPER_RING_OMP_H
+
+#include "improper_ring.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class ImproperRingOMP : public ImproperRing, public ThrOMP {
+
+ public:
+  ImproperRingOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/improper_umbrella_omp.cpp b/src/USER-OMP/improper_umbrella_omp.cpp
new file mode 100644
index 0000000000..9a64da1fb1
--- /dev/null
+++ b/src/USER-OMP/improper_umbrella_omp.cpp
@@ -0,0 +1,256 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "improper_umbrella_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOLERANCE 0.05
+#define SMALL     0.001
+
+/* ---------------------------------------------------------------------- */
+
+ImproperUmbrellaOMP::ImproperUmbrellaOMP(class LAMMPS *lmp)
+  : ImproperUmbrella(lmp), ThrOMP(lmp,THR_IMPROPER)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ImproperUmbrellaOMP::compute(int eflag, int vflag)
+{
+
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = neighbor->nimproperlist;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_bond) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_bond) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_bond) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+void ImproperUmbrellaOMP::eval(int nfrom, int nto, ThrData * const thr)
+{
+  int i1,i2,i3,i4,n,type;
+  double eimproper,f1[3],f2[3],f3[3],f4[3];
+  double vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z;
+  double domega,c,a,s,projhfg,dhax,dhay,dhaz,dahx,dahy,dahz,cotphi;
+  double ax,ay,az,ra2,rh2,ra,rh,rar,rhr,arx,ary,arz,hrx,hry,hrz;
+
+  eimproper = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const * const improperlist = neighbor->improperlist;
+  const int nlocal = atom->nlocal;
+
+  for (n = nfrom; n < nto; n++) {
+    i1 = improperlist[n][0];
+    i2 = improperlist[n][1];
+    i3 = improperlist[n][2];
+    i4 = improperlist[n][3];
+    type = improperlist[n][4];
+
+    // 1st bond
+
+    vb1x = x[i2][0] - x[i1][0];
+    vb1y = x[i2][1] - x[i1][1];
+    vb1z = x[i2][2] - x[i1][2];
+
+    // 2nd bond
+
+    vb2x = x[i3][0] - x[i1][0];
+    vb2y = x[i3][1] - x[i1][1];
+    vb2z = x[i3][2] - x[i1][2];
+
+    // 3rd bond
+
+    vb3x = x[i4][0] - x[i1][0];
+    vb3y = x[i4][1] - x[i1][1];
+    vb3z = x[i4][2] - x[i1][2];
+
+    // c0 calculation
+    // A = vb1 X vb2 is perpendicular to IJK plane
+
+    ax = vb1y*vb2z-vb1z*vb2y;
+    ay = vb1z*vb2x-vb1x*vb2z;
+    az = vb1x*vb2y-vb1y*vb2x;
+    ra2 = ax*ax+ay*ay+az*az;
+    rh2 = vb3x*vb3x+vb3y*vb3y+vb3z*vb3z;
+    ra = sqrt(ra2);
+    rh = sqrt(rh2);
+    if (ra < SMALL) ra = SMALL;
+    if (rh < SMALL) rh = SMALL;
+
+    rar = 1/ra;
+    rhr = 1/rh;
+    arx = ax*rar;
+    ary = ay*rar;
+    arz = az*rar;
+    hrx = vb3x*rhr;
+    hry = vb3y*rhr;
+    hrz = vb3z*rhr;
+
+    c = arx*hrx+ary*hry+arz*hrz;
+
+    // error check
+
+    if (c > 1.0 + TOLERANCE || c < (-1.0 - TOLERANCE)) {
+      int me = comm->me;
+
+      if (screen) {
+        char str[128];
+        sprintf(str,"Improper problem: %d/%d " BIGINT_FORMAT " %d %d %d %d",
+                me,thr->get_tid(),update->ntimestep,
+                atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+        error->warning(FLERR,str,0);
+        fprintf(screen,"  1st atom: %d %g %g %g\n",
+                me,x[i1][0],x[i1][1],x[i1][2]);
+        fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                me,x[i2][0],x[i2][1],x[i2][2]);
+        fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                me,x[i3][0],x[i3][1],x[i3][2]);
+        fprintf(screen,"  4th atom: %d %g %g %g\n",
+                me,x[i4][0],x[i4][1],x[i4][2]);
+      }
+    }
+
+    if (c > 1.0) s = 1.0;
+    if (c < -1.0) s = -1.0;
+
+    s = sqrt(1.0 - c*c);
+    if (s < SMALL) s = SMALL;
+    cotphi = c/s;
+
+    projhfg = (vb3x*vb1x+vb3y*vb1y+vb3z*vb1z) /
+      sqrt(vb1x*vb1x+vb1y*vb1y+vb1z*vb1z);
+    projhfg += (vb3x*vb2x+vb3y*vb2y+vb3z*vb2z) /
+      sqrt(vb2x*vb2x+vb2y*vb2y+vb2z*vb2z);
+    if (projhfg > 0.0) {
+      s *= -1.0;
+      cotphi *= -1.0;
+    }
+
+    //  force and energy
+    // if w0 = 0: E = k * (1 - cos w)
+    // if w0 != 0: E = 0.5 * C (cos w - cos w0)^2, C = k/(sin(w0)^2
+
+    if (w0[type] == 0.0) {
+      if (EFLAG) eimproper = kw[type] * (1.0-s);
+      a = -kw[type];
+    } else {
+      domega = s - cos(w0[type]);
+      a = 0.5 * C[type] * domega;
+      if (EFLAG) eimproper = a * domega;
+      a *= 2.0;
+    }
+
+    // dhax = diffrence between H and A in X direction, etc
+
+    a = a*cotphi;
+    dhax = hrx-c*arx;
+    dhay = hry-c*ary;
+    dhaz = hrz-c*arz;
+
+    dahx = arx-c*hrx;
+    dahy = ary-c*hry;
+    dahz = arz-c*hrz;
+
+    f2[0] = (dhay*vb1z - dhaz*vb1y)*rar;
+    f2[1] = (dhaz*vb1x - dhax*vb1z)*rar;
+    f2[2] = (dhax*vb1y - dhay*vb1x)*rar;
+
+    f3[0] = (-dhay*vb2z + dhaz*vb2y)*rar;
+    f3[1] = (-dhaz*vb2x + dhax*vb2z)*rar;
+    f3[2] = (-dhax*vb2y + dhay*vb2x)*rar;
+
+    f4[0] = dahx*rhr;
+    f4[1] = dahy*rhr;
+    f4[2] = dahz*rhr;
+
+    f1[0] = -(f2[0] + f3[0] + f4[0]);
+    f1[1] = -(f2[1] + f3[1] + f4[1]);
+    f1[2] = -(f2[2] + f3[2] + f4[2]);
+
+    // apply force to each of 4 atoms
+
+    if (NEWTON_BOND || i1 < nlocal) {
+      f[i1][0] += f1[0]*a;
+      f[i1][1] += f1[1]*a;
+      f[i1][2] += f1[2]*a;
+    }
+
+    if (NEWTON_BOND || i2 < nlocal) {
+      f[i2][0] += f3[0]*a;
+      f[i2][1] += f3[1]*a;
+      f[i2][2] += f3[2]*a;
+    }
+
+    if (NEWTON_BOND || i3 < nlocal) {
+      f[i3][0] += f2[0]*a;
+      f[i3][1] += f2[1]*a;
+      f[i3][2] += f2[2]*a;
+    }
+
+    if (NEWTON_BOND || i4 < nlocal) {
+      f[i4][0] += f4[0]*a;
+      f[i4][1] += f4[1]*a;
+      f[i4][2] += f4[2]*a;
+    }
+
+    if (EVFLAG)
+      ev_tally_thr(this,i1,i2,i3,i4,nlocal,NEWTON_BOND,eimproper,f1,f3,f4,
+                   vb1x,vb1y,vb1z,vb2x,vb2y,vb2z,vb3x,vb3y,vb3z,thr);
+  }
+}
diff --git a/src/USER-OMP/improper_umbrella_omp.h b/src/USER-OMP/improper_umbrella_omp.h
new file mode 100644
index 0000000000..41bdfed4d5
--- /dev/null
+++ b/src/USER-OMP/improper_umbrella_omp.h
@@ -0,0 +1,46 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef IMPROPER_CLASS
+
+ImproperStyle(umbrella/omp,ImproperUmbrellaOMP)
+
+#else
+
+#ifndef LMP_IMPROPER_UMBRELLA_OMP_H
+#define LMP_IMPROPER_UMBRELLA_OMP_H
+
+#include "improper_umbrella.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class ImproperUmbrellaOMP : public ImproperUmbrella, public ThrOMP {
+
+ public:
+  ImproperUmbrellaOMP(class LAMMPS *lmp);
+  virtual void compute(int, int);
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/neigh_derive_omp.cpp b/src/USER-OMP/neigh_derive_omp.cpp
new file mode 100644
index 0000000000..4356f17b62
--- /dev/null
+++ b/src/USER-OMP/neigh_derive_omp.cpp
@@ -0,0 +1,183 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "neighbor.h"
+#include "neighbor_omp.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   build half list from full list
+   pair stored once if i,j are both owned and i < j
+   pair stored by me if j is ghost (also stored by proc owning j)
+   works if full list is a skip list
+------------------------------------------------------------------------- */
+
+void Neighbor::half_from_full_no_newton_omp(NeighList *list)
+{
+  const int inum_full = list->listfull->inum;
+
+  NEIGH_OMP_INIT;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(inum_full);
+
+  int i,j,ii,jj,n,jnum,joriginal;
+  int *neighptr,*jlist;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int *ilist_full = list->listfull->ilist;
+  int *numneigh_full = list->listfull->numneigh;
+  int **firstneigh_full = list->listfull->firstneigh;
+
+  // each thread works on its own page
+  int npage = tid;
+  int npnt = 0;
+
+  // loop over atoms in full list
+
+  for (ii = ifrom; ii < ito; ii++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      // only one thread at a time may check whether we
+      // need new neighbor list pages and then add to them.
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    // loop over parent full list
+
+    i = ilist_full[ii];
+    jlist = firstneigh_full[i];
+    jnum = numneigh_full[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      joriginal = jlist[jj];
+      j = joriginal & NEIGHMASK;
+      if (j > i) neighptr[n++] = joriginal;
+    }
+
+    ilist[ii] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = inum_full;
+}
+
+/* ----------------------------------------------------------------------
+   build half list from full list
+   pair stored once if i,j are both owned and i < j
+   if j is ghost, only store if j coords are "above and to the right" of i
+   works if full list is a skip list
+------------------------------------------------------------------------- */
+
+void Neighbor::half_from_full_newton_omp(NeighList *list)
+{
+  const int inum_full = list->listfull->inum;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(inum_full);
+
+  int i,j,ii,jj,n,jnum,joriginal;
+  int *neighptr,*jlist;
+  double xtmp,ytmp,ztmp;
+
+  double **x = atom->x;
+  int nlocal = atom->nlocal;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int *ilist_full = list->listfull->ilist;
+  int *numneigh_full = list->listfull->numneigh;
+  int **firstneigh_full = list->listfull->firstneigh;
+
+  // each thread works on its own page
+  int npage = tid;
+  int npnt = 0;
+
+  // loop over parent full list
+
+  for (ii = ifrom; ii < ito; ii++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      // only one thread at a time may check  whether we
+      // need new neighbor list pages and then add to them.
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    i = ilist_full[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over full neighbor list
+
+    jlist = firstneigh_full[i];
+    jnum = numneigh_full[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      joriginal = jlist[jj];
+      j = joriginal & NEIGHMASK;
+      if (j < nlocal) {
+        if (i > j) continue;
+      } else {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp) {
+          if (x[j][1] < ytmp) continue;
+          if (x[j][1] == ytmp && x[j][0] < xtmp) continue;
+        }
+      }
+      neighptr[n++] = joriginal;
+    }
+
+    ilist[ii] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = inum_full;
+}
diff --git a/src/USER-OMP/neigh_full_omp.cpp b/src/USER-OMP/neigh_full_omp.cpp
new file mode 100644
index 0000000000..8ba2ffaf2b
--- /dev/null
+++ b/src/USER-OMP/neigh_full_omp.cpp
@@ -0,0 +1,579 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "neighbor.h"
+#include "neighbor_omp.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "group.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   N^2 search for all neighbors
+   every neighbor pair appears in list of both atoms i and j
+------------------------------------------------------------------------- */
+
+void Neighbor::full_nsq_omp(NeighList *list)
+{
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  const int bitmask = (includegroup) ? group->bitmask[includegroup] : 0;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,n,itype,jtype,which;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr;
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int nall = atom->nlocal + atom->nghost;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+
+  int npage = tid;
+  int npnt = 0;
+
+  // loop over owned atoms, storing neighbors
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over all atoms, owned and ghost
+    // skip i = j
+
+    for (j = 0; j < nall; j++) {
+      if (includegroup && !(mask[j] & bitmask)) continue;
+      if (i == j) continue;
+      jtype = type[j];
+      if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      if (rsq <= cutneighsq[itype][jtype]) {
+        if (molecular) {
+          which = find_special(special[i],nspecial[i],tag[j]);
+          if (which == 0) neighptr[n++] = j;
+          else if (domain->minimum_image_check(delx,dely,delz))
+            neighptr[n++] = j;
+          else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+        } else neighptr[n++] = j;
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+  list->gnum = 0;
+}
+
+/* ----------------------------------------------------------------------
+   N^2 search for all neighbors
+   include neighbors of ghost atoms, but no "special neighbors" for ghosts
+   every neighbor pair appears in list of both atoms i and j
+------------------------------------------------------------------------- */
+
+void Neighbor::full_nsq_ghost_omp(NeighList *list)
+{
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nall);
+
+  int i,j,n,itype,jtype,which;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr;
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+
+  int npage = tid;
+  int npnt = 0;
+
+  // loop over owned & ghost atoms, storing neighbors
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over all atoms, owned and ghost
+    // skip i = j
+    // no molecular test when i = ghost atom
+
+    if (i < nlocal) {
+      for (j = 0; j < nall; j++) {
+        if (i == j) continue;
+        jtype = type[j];
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+        }
+      }
+    } else {
+      for (j = 0; j < nall; j++) {
+        if (i == j) continue;
+        jtype = type[j];
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighghostsq[itype][jtype]) neighptr[n++] = j;
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+  list->gnum = nall - nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction for all neighbors
+   every neighbor pair appears in list of both atoms i and j
+------------------------------------------------------------------------- */
+
+void Neighbor::full_bin_omp(NeighList *list)
+{
+  // bin owned & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,itype,jtype,ibin,which;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr;
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int nstencil = list->nstencil;
+  int *stencil = list->stencil;
+
+  int npage = tid;
+  int npnt = 0;
+
+  // loop over owned atoms, storing neighbors
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over all atoms in surrounding bins in stencil including self
+    // skip i = j
+
+    ibin = coord2bin(x[i]);
+
+    for (k = 0; k < nstencil; k++) {
+      for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+        if (i == j) continue;
+
+        jtype = type[j];
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+  list->gnum = 0;
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction for all neighbors
+   include neighbors of ghost atoms, but no "special neighbors" for ghosts
+   every neighbor pair appears in list of both atoms i and j
+------------------------------------------------------------------------- */
+
+void Neighbor::full_bin_ghost_omp(NeighList *list)
+{
+  // bin owned & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nall);
+
+  int i,j,k,n,itype,jtype,ibin,which;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int xbin,ybin,zbin,xbin2,ybin2,zbin2;
+  int *neighptr;
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int nstencil = list->nstencil;
+  int *stencil = list->stencil;
+  int **stencilxyz = list->stencilxyz;
+
+  int npage = tid;
+  int npnt = 0;
+
+  // loop over owned & ghost atoms, storing neighbors
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over all atoms in surrounding bins in stencil including self
+    // when i is a ghost atom, must check if stencil bin is out of bounds
+    // skip i = j
+    // no molecular test when i = ghost atom
+
+    if (i < nlocal) {
+      ibin = coord2bin(x[i]);
+      for (k = 0; k < nstencil; k++) {
+        for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+          if (i == j) continue;
+
+          jtype = type[j];
+          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+          delx = xtmp - x[j][0];
+          dely = ytmp - x[j][1];
+          delz = ztmp - x[j][2];
+          rsq = delx*delx + dely*dely + delz*delz;
+
+          if (rsq <= cutneighsq[itype][jtype]) {
+            if (molecular) {
+              which = find_special(special[i],nspecial[i],tag[j]);
+              if (which == 0) neighptr[n++] = j;
+              else if (domain->minimum_image_check(delx,dely,delz))
+                neighptr[n++] = j;
+              else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+            } else neighptr[n++] = j;
+          }
+        }
+      }
+
+    } else {
+      ibin = coord2bin(x[i],xbin,ybin,zbin);
+      for (k = 0; k < nstencil; k++) {
+        xbin2 = xbin + stencilxyz[k][0];
+        ybin2 = ybin + stencilxyz[k][1];
+        zbin2 = zbin + stencilxyz[k][2];
+        if (xbin2 < 0 || xbin2 >= mbinx ||
+            ybin2 < 0 || ybin2 >= mbiny ||
+            zbin2 < 0 || zbin2 >= mbinz) continue;
+        for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+          if (i == j) continue;
+
+          jtype = type[j];
+          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+          delx = xtmp - x[j][0];
+          dely = ytmp - x[j][1];
+          delz = ztmp - x[j][2];
+          rsq = delx*delx + dely*dely + delz*delz;
+
+          if (rsq <= cutneighghostsq[itype][jtype]) neighptr[n++] = j;
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+  list->gnum = nall - nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction for all neighbors
+   multi-type stencil is itype dependent and is distance checked
+   every neighbor pair appears in list of both atoms i and j
+------------------------------------------------------------------------- */
+
+void Neighbor::full_multi_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,itype,jtype,ibin,which,ns;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr,*s;
+  double *cutsq,*distsq;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int *nstencil_multi = list->nstencil_multi;
+  int **stencil_multi = list->stencil_multi;
+  double **distsq_multi = list->distsq_multi;
+
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over all atoms in other bins in stencil, including self
+    // skip if i,j neighbor cutoff is less than bin distance
+    // skip i = j
+
+    ibin = coord2bin(x[i]);
+    s = stencil_multi[itype];
+    distsq = distsq_multi[itype];
+    cutsq = cutneighsq[itype];
+    ns = nstencil_multi[itype];
+    for (k = 0; k < ns; k++) {
+      for (j = binhead[ibin+s[k]]; j >= 0; j = bins[j]) {
+        jtype = type[j];
+        if (cutsq[jtype] < distsq[k]) continue;
+        if (i == j) continue;
+
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+  list->gnum = 0;
+}
diff --git a/src/USER-OMP/neigh_gran_omp.cpp b/src/USER-OMP/neigh_gran_omp.cpp
new file mode 100644
index 0000000000..71fee4ec4f
--- /dev/null
+++ b/src/USER-OMP/neigh_gran_omp.cpp
@@ -0,0 +1,657 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "neighbor.h"
+#include "neighbor_omp.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "comm.h"
+#include "group.h"
+#include "fix_shear_history.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   granular particles
+   N^2 / 2 search for neighbor pairs with partial Newton's 3rd law
+   shear history must be accounted for when a neighbor pair is added
+   pair added to list if atoms i and j are both owned and i < j
+   pair added if j is ghost (also stored by proc owning j)
+------------------------------------------------------------------------- */
+
+void Neighbor::granular_nsq_no_newton_omp(NeighList *list)
+{
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  const int bitmask = (includegroup) ? group->bitmask[includegroup] : 0;
+
+  FixShearHistory * const fix_history = list->fix_history;
+  NeighList * listgranhistory = list->listgranhistory;
+
+  NEIGH_OMP_INIT;
+
+  if (fix_history)
+    if (nthreads > listgranhistory->maxpage)
+      listgranhistory->add_pages(nthreads - listgranhistory->maxpage);
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list,listgranhistory)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,m,n,nn;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  double radi,radsum,cutsq;
+  int *neighptr,*touchptr;
+  double *shearptr;
+
+  int *npartner,**partner;
+  double ***shearpartner;
+  int **firsttouch;
+  double **firstshear;
+
+  double **x = atom->x;
+  double *radius = atom->radius;
+  int *tag = atom->tag;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int nall = atom->nlocal + atom->nghost;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+
+  if (fix_history) {
+    npartner = fix_history->npartner;
+    partner = fix_history->partner;
+    shearpartner = fix_history->shearpartner;
+    firsttouch = listgranhistory->firstneigh;
+    firstshear = listgranhistory->firstdouble;
+  }
+
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    {
+      if (pgsize - npnt < oneatom) {
+        npnt = 0;
+        npage += nthreads;
+        if (npage >= list->maxpage) {
+          list->add_pages(nthreads);
+          if (fix_history)
+            listgranhistory->add_pages(nthreads);
+        }
+      }
+
+      n = nn = 0;
+      neighptr = &(list->pages[npage][npnt]);
+      if (fix_history) {
+        touchptr = &(listgranhistory->pages[npage][npnt]);
+        shearptr = &(listgranhistory->dpages[npage][3*npnt]);
+      }
+    }
+
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    radi = radius[i];
+
+    // loop over remaining atoms, owned and ghost
+
+    for (j = i+1; j < nall; j++) {
+      if (includegroup && !(mask[j] & bitmask)) continue;
+      if (exclude && exclusion(i,j,type[i],type[j],mask,molecule)) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      radsum = radi + radius[j];
+      cutsq = (radsum+skin) * (radsum+skin);
+
+      if (rsq <= cutsq) {
+        neighptr[n] = j;
+
+        if (fix_history) {
+          if (rsq < radsum*radsum) {
+            for (m = 0; m < npartner[i]; m++)
+              if (partner[i][m] == tag[j]) break;
+            if (m < npartner[i]) {
+              touchptr[n] = 1;
+              shearptr[nn++] = shearpartner[i][m][0];
+              shearptr[nn++] = shearpartner[i][m][1];
+              shearptr[nn++] = shearpartner[i][m][2];
+            } else {
+              touchptr[n] = 0;
+              shearptr[nn++] = 0.0;
+              shearptr[nn++] = 0.0;
+              shearptr[nn++] = 0.0;
+            }
+          } else {
+            touchptr[n] = 0;
+            shearptr[nn++] = 0.0;
+            shearptr[nn++] = 0.0;
+            shearptr[nn++] = 0.0;
+          }
+        }
+
+        n++;
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    if (fix_history) {
+      firsttouch[i] = touchptr;
+      firstshear[i] = shearptr;
+    }
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   granular particles
+   N^2 / 2 search for neighbor pairs with full Newton's 3rd law
+   no shear history is allowed for this option
+   pair added to list if atoms i and j are both owned and i < j
+   if j is ghost only me or other proc adds pair
+   decision based on itag,jtag tests
+------------------------------------------------------------------------- */
+
+void Neighbor::granular_nsq_newton_omp(NeighList *list)
+{
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  const int bitmask = (includegroup) ? group->bitmask[includegroup] : 0;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,n,itag,jtag;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  double radi,radsum,cutsq;
+  int *neighptr;
+
+  double **x = atom->x;
+  double *radius = atom->radius;
+  int *tag = atom->tag;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int nall = atom->nlocal + atom->nghost;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itag = tag[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    radi = radius[i];
+
+    // loop over remaining atoms, owned and ghost
+
+    for (j = i+1; j < nall; j++) {
+      if (includegroup && !(mask[j] & bitmask)) continue;
+
+      if (j >= nlocal) {
+        jtag = tag[j];
+        if (itag > jtag) {
+          if ((itag+jtag) % 2 == 0) continue;
+        } else if (itag < jtag) {
+          if ((itag+jtag) % 2 == 1) continue;
+        } else {
+          if (x[j][2] < ztmp) continue;
+          if (x[j][2] == ztmp) {
+            if (x[j][1] < ytmp) continue;
+            if (x[j][1] == ytmp && x[j][0] < xtmp) continue;
+          }
+        }
+      }
+
+      if (exclude && exclusion(i,j,type[i],type[j],mask,molecule)) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      radsum = radi + radius[j];
+      cutsq = (radsum+skin) * (radsum+skin);
+
+      if (rsq <= cutsq) neighptr[n++] = j;
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   granular particles
+   binned neighbor list construction with partial Newton's 3rd law
+   shear history must be accounted for when a neighbor pair is added
+   each owned atom i checks own bin and surrounding bins in non-Newton stencil
+   pair stored once if i,j are both owned and i < j
+   pair stored by me if j is ghost (also stored by proc owning j)
+------------------------------------------------------------------------- */
+
+void Neighbor::granular_bin_no_newton_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  FixShearHistory * const fix_history = list->fix_history;
+  NeighList * listgranhistory = list->listgranhistory;
+
+  NEIGH_OMP_INIT;
+
+  if (fix_history)
+    if (nthreads > listgranhistory->maxpage)
+      listgranhistory->add_pages(nthreads - listgranhistory->maxpage);
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list,listgranhistory)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,m,n,nn,ibin;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  double radi,radsum,cutsq;
+  int *neighptr,*touchptr;
+  double *shearptr;
+
+  int *npartner,**partner;
+  double ***shearpartner;
+  int **firsttouch;
+  double **firstshear;
+
+  // loop over each atom, storing neighbors
+
+  double **x = atom->x;
+  double *radius = atom->radius;
+  int *tag = atom->tag;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int nstencil = list->nstencil;
+  int *stencil = list->stencil;
+
+  if (fix_history) {
+    npartner = fix_history->npartner;
+    partner = fix_history->partner;
+    shearpartner = fix_history->shearpartner;
+    firsttouch = listgranhistory->firstneigh;
+    firstshear = listgranhistory->firstdouble;
+  }
+
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    {
+      if (pgsize - npnt < oneatom) {
+        npnt = 0;
+        npage += nthreads;
+        if (npage >= list->maxpage) {
+          list->add_pages(nthreads);
+          if (fix_history)
+            listgranhistory->add_pages(nthreads);
+        }
+      }
+
+      n = nn = 0;
+      neighptr = &(list->pages[npage][npnt]);
+      if (fix_history) {
+        touchptr = &(listgranhistory->pages[npage][npnt]);
+        shearptr = &(listgranhistory->dpages[npage][3*npnt]);
+      }
+    }
+
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    radi = radius[i];
+    ibin = coord2bin(x[i]);
+
+    // loop over all atoms in surrounding bins in stencil including self
+    // only store pair if i < j
+    // stores own/own pairs only once
+    // stores own/ghost pairs on both procs
+
+    for (k = 0; k < nstencil; k++) {
+      for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+        if (j <= i) continue;
+        if (exclude && exclusion(i,j,type[i],type[j],mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+        radsum = radi + radius[j];
+        cutsq = (radsum+skin) * (radsum+skin);
+
+        if (rsq <= cutsq) {
+          neighptr[n] = j;
+
+          if (fix_history) {
+            if (rsq < radsum*radsum) {
+              for (m = 0; m < npartner[i]; m++)
+                if (partner[i][m] == tag[j]) break;
+              if (m < npartner[i]) {
+                touchptr[n] = 1;
+                shearptr[nn++] = shearpartner[i][m][0];
+                shearptr[nn++] = shearpartner[i][m][1];
+                shearptr[nn++] = shearpartner[i][m][2];
+              } else {
+                touchptr[n] = 0;
+                shearptr[nn++] = 0.0;
+                shearptr[nn++] = 0.0;
+                shearptr[nn++] = 0.0;
+              }
+            } else {
+              touchptr[n] = 0;
+              shearptr[nn++] = 0.0;
+              shearptr[nn++] = 0.0;
+              shearptr[nn++] = 0.0;
+            }
+          }
+
+          n++;
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    if (fix_history) {
+      firsttouch[i] = touchptr;
+      firstshear[i] = shearptr;
+    }
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   granular particles
+   binned neighbor list construction with full Newton's 3rd law
+   no shear history is allowed for this option
+   each owned atom i checks its own bin and other bins in Newton stencil
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+void Neighbor::granular_bin_newton_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,ibin;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  double radi,radsum,cutsq;
+  int *neighptr;
+
+  // loop over each atom, storing neighbors
+
+  double **x = atom->x;
+  double *radius = atom->radius;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int nstencil = list->nstencil;
+  int *stencil = list->stencil;
+
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    n = 0;
+    neighptr = &(list->pages[npage][npnt]);
+
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    radi = radius[i];
+
+    // loop over rest of atoms in i's bin, ghosts are at end of linked list
+    // if j is owned atom, store it, since j is beyond i in linked list
+    // if j is ghost, only store if j coords are "above and to the right" of i
+
+    for (j = bins[i]; j >= 0; j = bins[j]) {
+      if (j >= nlocal) {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp) {
+          if (x[j][1] < ytmp) continue;
+          if (x[j][1] == ytmp && x[j][0] < xtmp) continue;
+        }
+      }
+
+      if (exclude && exclusion(i,j,type[i],type[j],mask,molecule)) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      radsum = radi + radius[j];
+      cutsq = (radsum+skin) * (radsum+skin);
+
+      if (rsq <= cutsq) neighptr[n++] = j;
+    }
+
+    // loop over all atoms in other bins in stencil, store every pair
+
+    ibin = coord2bin(x[i]);
+    for (k = 0; k < nstencil; k++) {
+      for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+        if (exclude && exclusion(i,j,type[i],type[j],mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+        radsum = radi + radius[j];
+        cutsq = (radsum+skin) * (radsum+skin);
+
+        if (rsq <= cutsq) neighptr[n++] = j;
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   granular particles
+   binned neighbor list construction with Newton's 3rd law for triclinic
+   no shear history is allowed for this option
+   each owned atom i checks its own bin and other bins in triclinic stencil
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+void Neighbor::granular_bin_newton_tri_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,ibin;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  double radi,radsum,cutsq;
+  int *neighptr;
+
+  // loop over each atom, storing neighbors
+
+  double **x = atom->x;
+  double *radius = atom->radius;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int nstencil = list->nstencil;
+  int *stencil = list->stencil;
+
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    n = 0;
+    neighptr = &(list->pages[npage][npnt]);
+
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    radi = radius[i];
+
+    // loop over all atoms in bins in stencil
+    // pairs for atoms j "below" i are excluded
+    // below = lower z or (equal z and lower y) or (equal zy and lower x)
+    //         (equal zyx and j <= i)
+    // latter excludes self-self interaction but allows superposed atoms
+
+    ibin = coord2bin(x[i]);
+    for (k = 0; k < nstencil; k++) {
+      for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp) {
+          if (x[j][1] < ytmp) continue;
+          if (x[j][1] == ytmp) {
+            if (x[j][0] < xtmp) continue;
+            if (x[j][0] == xtmp && j <= i) continue;
+          }
+        }
+
+        if (exclude && exclusion(i,j,type[i],type[j],mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+        radsum = radi + radius[j];
+        cutsq = (radsum+skin) * (radsum+skin);
+
+        if (rsq <= cutsq) neighptr[n++] = j;
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
diff --git a/src/USER-OMP/neigh_half_bin_omp.cpp b/src/USER-OMP/neigh_half_bin_omp.cpp
new file mode 100644
index 0000000000..ba0a180410
--- /dev/null
+++ b/src/USER-OMP/neigh_half_bin_omp.cpp
@@ -0,0 +1,522 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "neighbor.h"
+#include "neighbor_omp.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with partial Newton's 3rd law
+   each owned atom i checks own bin and other bins in stencil
+   pair stored once if i,j are both owned and i < j
+   pair stored by me if j is ghost (also stored by proc owning j)
+------------------------------------------------------------------------- */
+
+void Neighbor::half_bin_no_newton_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,itype,jtype,ibin,which;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int nstencil = list->nstencil;
+  int *stencil = list->stencil;
+
+  // each thread works on its own page
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over all atoms in other bins in stencil including self
+    // only store pair if i < j
+    // stores own/own pairs only once
+    // stores own/ghost pairs on both procs
+
+    ibin = coord2bin(x[i]);
+
+    for (k = 0; k < nstencil; k++) {
+      for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+        if (j <= i) continue;
+
+        jtype = type[j];
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with partial Newton's 3rd law
+   include neighbors of ghost atoms, but no "special neighbors" for ghosts
+   owned and ghost atoms check own bin and other bins in stencil
+   pair stored once if i,j are both owned and i < j
+   pair stored by me if i owned and j ghost (also stored by proc owning j)
+   pair stored once if i,j are both ghost and i < j
+------------------------------------------------------------------------- */
+
+void Neighbor::half_bin_no_newton_ghost_omp(NeighList *list)
+{
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nall);
+
+  int i,j,k,n,itype,jtype,ibin,which;
+  int xbin,ybin,zbin,xbin2,ybin2,zbin2;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr;
+
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int nstencil = list->nstencil;
+  int *stencil = list->stencil;
+  int **stencilxyz = list->stencilxyz;
+
+  // each thread works on its own page
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over all atoms in other bins in stencil including self
+    // when i is a ghost atom, must check if stencil bin is out of bounds
+    // only store pair if i < j
+    // stores own/own pairs only once
+    // stores own/ghost pairs with owned atom only, on both procs
+    // stores ghost/ghost pairs only once
+    // no molecular test when i = ghost atom
+
+    if (i < nlocal) {
+      ibin = coord2bin(x[i]);
+
+      for (k = 0; k < nstencil; k++) {
+        for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+          if (j <= i) continue;
+
+          jtype = type[j];
+          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+          delx = xtmp - x[j][0];
+          dely = ytmp - x[j][1];
+          delz = ztmp - x[j][2];
+          rsq = delx*delx + dely*dely + delz*delz;
+
+          if (rsq <= cutneighsq[itype][jtype]) {
+            if (molecular) {
+              which = find_special(special[i],nspecial[i],tag[j]);
+              if (which == 0) neighptr[n++] = j;
+              else if (domain->minimum_image_check(delx,dely,delz))
+                neighptr[n++] = j;
+              else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+            } else neighptr[n++] = j;
+          }
+        }
+      }
+
+    } else {
+      ibin = coord2bin(x[i],xbin,ybin,zbin);
+      for (k = 0; k < nstencil; k++) {
+        xbin2 = xbin + stencilxyz[k][0];
+        ybin2 = ybin + stencilxyz[k][1];
+        zbin2 = zbin + stencilxyz[k][2];
+        if (xbin2 < 0 || xbin2 >= mbinx ||
+            ybin2 < 0 || ybin2 >= mbiny ||
+            zbin2 < 0 || zbin2 >= mbinz) continue;
+        for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+          if (j <= i) continue;
+
+          jtype = type[j];
+          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+          delx = xtmp - x[j][0];
+          dely = ytmp - x[j][1];
+          delz = ztmp - x[j][2];
+          rsq = delx*delx + dely*dely + delz*delz;
+
+          if (rsq <= cutneighghostsq[itype][jtype]) neighptr[n++] = j;
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+  list->gnum = nall - atom->nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with full Newton's 3rd law
+   each owned atom i checks its own bin and other bins in Newton stencil
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+void Neighbor::half_bin_newton_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,itype,jtype,ibin,which;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int nstencil = list->nstencil;
+  int *stencil = list->stencil;
+
+  // each thread works on its own page
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over rest of atoms in i's bin, ghosts are at end of linked list
+    // if j is owned atom, store it, since j is beyond i in linked list
+    // if j is ghost, only store if j coords are "above and to the right" of i
+
+    for (j = bins[i]; j >= 0; j = bins[j]) {
+      if (j >= nlocal) {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp) {
+          if (x[j][1] < ytmp) continue;
+          if (x[j][1] == ytmp && x[j][0] < xtmp) continue;
+        }
+      }
+
+      jtype = type[j];
+      if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq <= cutneighsq[itype][jtype]) {
+        if (molecular) {
+          which = find_special(special[i],nspecial[i],tag[j]);
+          if (which == 0) neighptr[n++] = j;
+          else if (domain->minimum_image_check(delx,dely,delz))
+            neighptr[n++] = j;
+          else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          // OLD: if (which >= 0) neighptr[n++] = j ^ (which << SBBITS);
+        } else neighptr[n++] = j;
+      }
+    }
+
+    // loop over all atoms in other bins in stencil, store every pair
+
+    ibin = coord2bin(x[i]);
+    for (k = 0; k < nstencil; k++) {
+      for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+        jtype = type[j];
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+            // OLD: if (which >= 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with Newton's 3rd law for triclinic
+   each owned atom i checks its own bin and other bins in triclinic stencil
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+void Neighbor::half_bin_newton_tri_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,itype,jtype,ibin,which;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int nstencil = list->nstencil;
+  int *stencil = list->stencil;
+
+  // each thread works on its own page
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over all atoms in bins in stencil
+    // pairs for atoms j "below" i are excluded
+    // below = lower z or (equal z and lower y) or (equal zy and lower x)
+    //         (equal zyx and j <= i)
+    // latter excludes self-self interaction but allows superposed atoms
+
+    ibin = coord2bin(x[i]);
+    for (k = 0; k < nstencil; k++) {
+      for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp) {
+          if (x[j][1] < ytmp) continue;
+          if (x[j][1] == ytmp) {
+            if (x[j][0] < xtmp) continue;
+            if (x[j][0] == xtmp && j <= i) continue;
+          }
+        }
+
+        jtype = type[j];
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
diff --git a/src/USER-OMP/neigh_half_multi_omp.cpp b/src/USER-OMP/neigh_half_multi_omp.cpp
new file mode 100644
index 0000000000..5cc6ba1401
--- /dev/null
+++ b/src/USER-OMP/neigh_half_multi_omp.cpp
@@ -0,0 +1,405 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "neighbor.h"
+#include "neighbor_omp.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with partial Newton's 3rd law
+   each owned atom i checks own bin and other bins in stencil
+   multi-type stencil is itype dependent and is distance checked
+   pair stored once if i,j are both owned and i < j
+   pair stored by me if j is ghost (also stored by proc owning j)
+------------------------------------------------------------------------- */
+
+void Neighbor::half_multi_no_newton_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,itype,jtype,ibin,which,ns;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr,*s;
+  double *cutsq,*distsq;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int *nstencil_multi = list->nstencil_multi;
+  int **stencil_multi = list->stencil_multi;
+  double **distsq_multi = list->distsq_multi;
+
+  // each thread works on its own page
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over all atoms in other bins in stencil including self
+    // only store pair if i < j
+    // skip if i,j neighbor cutoff is less than bin distance
+    // stores own/own pairs only once
+    // stores own/ghost pairs on both procs
+
+    ibin = coord2bin(x[i]);
+    s = stencil_multi[itype];
+    distsq = distsq_multi[itype];
+    cutsq = cutneighsq[itype];
+    ns = nstencil_multi[itype];
+    for (k = 0; k < ns; k++) {
+      for (j = binhead[ibin+s[k]]; j >= 0; j = bins[j]) {
+        if (j <= i) continue;
+        jtype = type[j];
+        if (cutsq[jtype] < distsq[k]) continue;
+
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with full Newton's 3rd law
+   each owned atom i checks its own bin and other bins in Newton stencil
+   multi-type stencil is itype dependent and is distance checked
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+void Neighbor::half_multi_newton_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,itype,jtype,ibin,which,ns;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr,*s;
+  double *cutsq,*distsq;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int *nstencil_multi = list->nstencil_multi;
+  int **stencil_multi = list->stencil_multi;
+  double **distsq_multi = list->distsq_multi;
+
+  // each thread works on its own page
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over rest of atoms in i's bin, ghosts are at end of linked list
+    // if j is owned atom, store it, since j is beyond i in linked list
+    // if j is ghost, only store if j coords are "above and to the right" of i
+
+    for (j = bins[i]; j >= 0; j = bins[j]) {
+      if (j >= nlocal) {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp) {
+          if (x[j][1] < ytmp) continue;
+          if (x[j][1] == ytmp && x[j][0] < xtmp) continue;
+        }
+      }
+
+      jtype = type[j];
+      if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq <= cutneighsq[itype][jtype]) {
+        if (molecular) {
+          which = find_special(special[i],nspecial[i],tag[j]);
+          if (which == 0) neighptr[n++] = j;
+          else if (domain->minimum_image_check(delx,dely,delz))
+            neighptr[n++] = j;
+          else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+        } else neighptr[n++] = j;
+      }
+    }
+
+    // loop over all atoms in other bins in stencil, store every pair
+    // skip if i,j neighbor cutoff is less than bin distance
+
+    ibin = coord2bin(x[i]);
+    s = stencil_multi[itype];
+    distsq = distsq_multi[itype];
+    cutsq = cutneighsq[itype];
+    ns = nstencil_multi[itype];
+    for (k = 0; k < ns; k++) {
+      for (j = binhead[ibin+s[k]]; j >= 0; j = bins[j]) {
+        jtype = type[j];
+        if (cutsq[jtype] < distsq[k]) continue;
+
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with Newton's 3rd law for triclinic
+   each owned atom i checks its own bin and other bins in triclinic stencil
+   multi-type stencil is itype dependent and is distance checked
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+void Neighbor::half_multi_newton_tri_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,itype,jtype,ibin,which,ns;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr,*s;
+  double *cutsq,*distsq;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int *nstencil_multi = list->nstencil_multi;
+  int **stencil_multi = list->stencil_multi;
+  double **distsq_multi = list->distsq_multi;
+
+  // each thread works on its own page
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over all atoms in bins, including self, in stencil
+    // skip if i,j neighbor cutoff is less than bin distance
+    // bins below self are excluded from stencil
+    // pairs for atoms j "below" i are excluded
+    // below = lower z or (equal z and lower y) or (equal zy and lower x)
+    //         (equal zyx and j <= i)
+    // latter excludes self-self interaction but allows superposed atoms
+
+    ibin = coord2bin(x[i]);
+    s = stencil_multi[itype];
+    distsq = distsq_multi[itype];
+    cutsq = cutneighsq[itype];
+    ns = nstencil_multi[itype];
+    for (k = 0; k < ns; k++) {
+      for (j = binhead[ibin+s[k]]; j >= 0; j = bins[j]) {
+        jtype = type[j];
+        if (cutsq[jtype] < distsq[k]) continue;
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp) {
+          if (x[j][1] < ytmp) continue;
+          if (x[j][1] == ytmp) {
+            if (x[j][0] < xtmp) continue;
+            if (x[j][0] == xtmp && j <= i) continue;
+          }
+        }
+
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
diff --git a/src/USER-OMP/neigh_half_nsq_omp.cpp b/src/USER-OMP/neigh_half_nsq_omp.cpp
new file mode 100644
index 0000000000..6a8bee491a
--- /dev/null
+++ b/src/USER-OMP/neigh_half_nsq_omp.cpp
@@ -0,0 +1,350 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "neighbor.h"
+#include "neighbor_omp.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "group.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   N^2 / 2 search for neighbor pairs with partial Newton's 3rd law
+   pair stored once if i,j are both owned and i < j
+   pair stored by me if j is ghost (also stored by proc owning j)
+------------------------------------------------------------------------- */
+
+void Neighbor::half_nsq_no_newton_omp(NeighList *list)
+{
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  const int bitmask = (includegroup) ? group->bitmask[includegroup] : 0;
+  const int nall = atom->nlocal + atom->nghost;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,n,itype,jtype,which;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr;
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+
+  // each thread works on its own page
+  int npage = tid;
+  int npnt = 0;
+
+  // loop over owned atoms, storing neighbors
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over remaining atoms, owned and ghost
+    // only store pair if i < j
+
+    for (j = i+1; j < nall; j++) {
+      if (includegroup && !(mask[j] & bitmask)) continue;
+      jtype = type[j];
+      if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq <= cutneighsq[itype][jtype]) {
+        if (molecular) {
+          which = find_special(special[i],nspecial[i],tag[j]);
+          if (which == 0) neighptr[n++] = j;
+          else if (domain->minimum_image_check(delx,dely,delz))
+            neighptr[n++] = j;
+          else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+        } else neighptr[n++] = j;
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   N^2 / 2 search for neighbor pairs with partial Newton's 3rd law
+   include neighbors of ghost atoms, but no "special neighbors" for ghosts
+   pair stored once if i,j are both owned and i < j
+   pair stored by me if i owned and j ghost (also stored by proc owning j)
+   pair stored once if i,j are both ghost and i < j
+------------------------------------------------------------------------- */
+
+void Neighbor::half_nsq_no_newton_ghost_omp(NeighList *list)
+{
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  const int bitmask = (includegroup) ? group->bitmask[includegroup] : 0;
+  const int nall = nlocal + atom->nghost;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nall);
+
+  int i,j,n,itype,jtype,which;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr;
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+
+  // each thread works on its own page
+  int npage = tid;
+  int npnt = 0;
+
+  // loop over owned & ghost atoms, storing neighbors
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over remaining atoms, owned and ghost
+    // only store pair if i < j
+    // stores own/own pairs only once
+    // stores own/ghost pairs with owned atom only, on both procs
+    // stores ghost/ghost pairs only once
+    // no molecular test when i = ghost atom
+
+    if (i < nlocal) {
+      for (j = i+1; j < nall; j++) {
+        if (includegroup && !(mask[j] & bitmask)) continue;
+        jtype = type[j];
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+        }
+      }
+
+    } else {
+      for (j = i+1; j < nall; j++) {
+        if (includegroup && !(mask[j] & bitmask)) continue;
+        jtype = type[j];
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) neighptr[n++] = j;
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = atom->nlocal;
+  list->gnum = nall - atom->nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   N^2 / 2 search for neighbor pairs with full Newton's 3rd law
+   every pair stored exactly once by some processor
+   decision on ghost atoms based on itag,jtag tests
+------------------------------------------------------------------------- */
+
+void Neighbor::half_nsq_newton_omp(NeighList *list)
+{
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  const int bitmask = (includegroup) ? group->bitmask[includegroup] : 0;
+
+  NEIGH_OMP_INIT;
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,n,itype,jtype,itag,jtag,which;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int nall = atom->nlocal + atom->nghost;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+
+  int npage = tid;
+  int npnt = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage >= list->maxpage) list->add_pages(nthreads);
+    }
+
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+    itag = tag[i];
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over remaining atoms, owned and ghost
+    // itag = jtag is possible for long cutoffs that include images of self
+
+    for (j = i+1; j < nall; j++) {
+      if (includegroup && !(mask[j] & bitmask)) continue;
+
+      if (j >= nlocal) {
+        jtag = tag[j];
+        if (itag > jtag) {
+          if ((itag+jtag) % 2 == 0) continue;
+        } else if (itag < jtag) {
+          if ((itag+jtag) % 2 == 1) continue;
+        } else {
+          if (x[j][2] < ztmp) continue;
+          if (x[j][2] == ztmp) {
+            if (x[j][1] < ytmp) continue;
+            if (x[j][1] == ytmp && x[j][0] < xtmp) continue;
+          }
+        }
+      }
+
+      jtype = type[j];
+      if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq <= cutneighsq[itype][jtype]) {
+        if (molecular) {
+          which = find_special(special[i],nspecial[i],tag[j]);
+          if (which == 0) neighptr[n++] = j;
+          else if (domain->minimum_image_check(delx,dely,delz))
+            neighptr[n++] = j;
+          else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+        } else neighptr[n++] = j;
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+}
diff --git a/src/USER-OMP/neigh_respa_omp.cpp b/src/USER-OMP/neigh_respa_omp.cpp
new file mode 100644
index 0000000000..8c4b15d505
--- /dev/null
+++ b/src/USER-OMP/neigh_respa_omp.cpp
@@ -0,0 +1,1031 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "neighbor.h"
+#include "neighbor_omp.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "group.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ----------------------------------------------------------------------
+   multiple respa lists
+   N^2 / 2 search for neighbor pairs with partial Newton's 3rd law
+   pair added to list if atoms i and j are both owned and i < j
+   pair added if j is ghost (also stored by proc owning j)
+------------------------------------------------------------------------- */
+
+void Neighbor::respa_nsq_no_newton_omp(NeighList *list)
+{
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  const int bitmask = (includegroup) ? group->bitmask[includegroup] : 0;
+
+  NEIGH_OMP_INIT;
+
+  NeighList *listinner = list->listinner;
+  if (nthreads > listinner->maxpage)
+    listinner->add_pages(nthreads - listinner->maxpage);
+
+  NeighList *listmiddle;
+  const int respamiddle = list->respamiddle;
+  if (respamiddle) {
+    listmiddle = list->listmiddle;
+    if (nthreads > listmiddle->maxpage)
+      listmiddle->add_pages(nthreads - listmiddle->maxpage);
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list,listinner,listmiddle)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,n,itype,jtype,n_inner,n_middle;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr,*neighptr_inner,*neighptr_middle;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int nall = atom->nlocal + atom->nghost;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+
+  int *ilist_inner = listinner->ilist;
+  int *numneigh_inner = listinner->numneigh;
+  int **firstneigh_inner = listinner->firstneigh;
+
+  int *ilist_middle,*numneigh_middle,**firstneigh_middle;
+  if (respamiddle) {
+    ilist_middle = listmiddle->ilist;
+    numneigh_middle = listmiddle->numneigh;
+    firstneigh_middle = listmiddle->firstneigh;
+  }
+
+  int npage = tid;
+  int npnt = 0;
+  int npage_inner = tid;
+  int npnt_inner = 0;
+  int npage_middle = tid;
+  int npnt_middle = 0;
+
+  int which = 0;
+  int minchange = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage == list->maxpage) list->add_pages(nthreads);
+    }
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt_inner < oneatom) {
+      npnt_inner = 0;
+      npage_inner += nthreads;
+      if (npage_inner == listinner->maxpage) listinner->add_pages(nthreads);
+    }
+    neighptr_inner = &(listinner->pages[npage_inner][npnt_inner]);
+    n_inner = 0;
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (respamiddle) {
+      if (pgsize - npnt_middle < oneatom) {
+        npnt_middle = 0;
+        npage_middle += nthreads;
+        if (npage_middle == listmiddle->maxpage) listmiddle->add_pages(nthreads);
+      }
+      neighptr_middle = &(listmiddle->pages[npage_middle][npnt_middle]);
+      n_middle = 0;
+    }
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over remaining atoms, owned and ghost
+
+    for (j = i+1; j < nall; j++) {
+      if (includegroup && !(mask[j] & bitmask)) continue;
+      jtype = type[j];
+      if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq <= cutneighsq[itype][jtype]) {
+        if (molecular) {
+          which = find_special(special[i],nspecial[i],tag[j]);
+          if (which == 0) neighptr[n++] = j;
+          else if (minchange = domain->minimum_image_check(delx,dely,delz))
+            neighptr[n++] = j;
+          else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+        } else neighptr[n++] = j;
+
+        if (rsq < cut_inner_sq) {
+          if (which == 0) neighptr_inner[n_inner++] = j;
+          else if (minchange) neighptr_inner[n_inner++] = j;
+          else if (which > 0) neighptr_inner[n_inner++] = j ^ (which << SBBITS);
+        }
+
+        if (respamiddle && rsq < cut_middle_sq && rsq > cut_middle_inside_sq) {
+          if (which == 0) neighptr_middle[n_middle++] = j;
+          else if (minchange) neighptr_middle[n_middle++] = j;
+          else if (which > 0)
+            neighptr_middle[n_middle++] = j ^ (which << SBBITS);
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+
+    ilist_inner[i] = i;
+    firstneigh_inner[i] = neighptr_inner;
+    numneigh_inner[i] = n_inner;
+    npnt_inner += n_inner;
+    if (n_inner > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+
+    if (respamiddle) {
+      ilist_middle[i] = i;
+      firstneigh_middle[i] = neighptr_middle;
+      numneigh_middle[i] = n_middle;
+      npnt_middle += n_middle;
+      if (n_middle > oneatom)
+        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+    }
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+  listinner->inum = nlocal;
+  if (respamiddle) listmiddle->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   multiple respa lists
+   N^2 / 2 search for neighbor pairs with full Newton's 3rd law
+   pair added to list if atoms i and j are both owned and i < j
+   if j is ghost only me or other proc adds pair
+   decision based on itag,jtag tests
+------------------------------------------------------------------------- */
+
+void Neighbor::respa_nsq_newton_omp(NeighList *list)
+{
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+  const int bitmask = (includegroup) ? group->bitmask[includegroup] : 0;
+
+  NEIGH_OMP_INIT;
+
+  NeighList *listinner = list->listinner;
+  if (nthreads > listinner->maxpage)
+    listinner->add_pages(nthreads - listinner->maxpage);
+
+  NeighList *listmiddle;
+  const int respamiddle = list->respamiddle;
+  if (respamiddle) {
+    listmiddle = list->listmiddle;
+    if (nthreads > listmiddle->maxpage)
+      listmiddle->add_pages(nthreads - listmiddle->maxpage);
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list,listinner,listmiddle)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,n,itype,jtype,itag,jtag,n_inner,n_middle;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr,*neighptr_inner,*neighptr_middle;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int nall = atom->nlocal + atom->nghost;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+
+  int *ilist_inner = listinner->ilist;
+  int *numneigh_inner = listinner->numneigh;
+  int **firstneigh_inner = listinner->firstneigh;
+
+  int *ilist_middle,*numneigh_middle,**firstneigh_middle;
+  if (respamiddle) {
+    ilist_middle = listmiddle->ilist;
+    numneigh_middle = listmiddle->numneigh;
+    firstneigh_middle = listmiddle->firstneigh;
+  }
+
+  int npage = tid;
+  int npnt = 0;
+  int npage_inner = tid;
+  int npnt_inner = 0;
+  int npage_middle = tid;
+  int npnt_middle = 0;
+
+  int which = 0;
+  int minchange = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage == list->maxpage) list->add_pages(nthreads);
+    }
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt_inner < oneatom) {
+      npnt_inner = 0;
+      npage_inner += nthreads;
+      if (npage_inner == listinner->maxpage) listinner->add_pages(nthreads);
+    }
+    neighptr_inner = &(listinner->pages[npage_inner][npnt_inner]);
+    n_inner = 0;
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (respamiddle) {
+      if (pgsize - npnt_middle < oneatom) {
+        npnt_middle = 0;
+        npage_middle += nthreads;
+        if (npage_middle == listmiddle->maxpage) listmiddle->add_pages(nthreads);
+      }
+      neighptr_middle = &(listmiddle->pages[npage_middle][npnt_middle]);
+      n_middle = 0;
+    }
+
+    itag = tag[i];
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over remaining atoms, owned and ghost
+
+    for (j = i+1; j < nall; j++) {
+      if (includegroup && !(mask[j] & bitmask)) continue;
+
+      if (j >= nlocal) {
+        jtag = tag[j];
+        if (itag > jtag) {
+          if ((itag+jtag) % 2 == 0) continue;
+        } else if (itag < jtag) {
+          if ((itag+jtag) % 2 == 1) continue;
+        } else {
+          if (x[j][2] < ztmp) continue;
+          if (x[j][2] == ztmp) {
+            if (x[j][1] < ytmp) continue;
+            if (x[j][1] == ytmp && x[j][0] < xtmp) continue;
+          }
+        }
+      }
+
+      jtype = type[j];
+      if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq <= cutneighsq[itype][jtype]) {
+        if (molecular) {
+          which = find_special(special[i],nspecial[i],tag[j]);
+          if (which == 0) neighptr[n++] = j;
+          else if (minchange = domain->minimum_image_check(delx,dely,delz))
+            neighptr[n++] = j;
+          else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+        } else neighptr[n++] = j;
+
+        if (rsq < cut_inner_sq) {
+          if (which == 0) neighptr_inner[n_inner++] = j;
+          else if (minchange) neighptr_inner[n_inner++] = j;
+          else if (which > 0) neighptr_inner[n_inner++] = j ^ (which << SBBITS);
+        }
+
+        if (respamiddle &&
+            rsq < cut_middle_sq && rsq > cut_middle_inside_sq) {
+          if (which == 0) neighptr_middle[n_middle++] = j;
+          else if (minchange) neighptr_middle[n_middle++] = j;
+          else if (which > 0)
+            neighptr_middle[n_middle++] = j ^ (which << SBBITS);
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+
+    ilist_inner[i] = i;
+    firstneigh_inner[i] = neighptr_inner;
+    numneigh_inner[i] = n_inner;
+    npnt_inner += n_inner;
+    if (n_inner > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+
+    if (respamiddle) {
+      ilist_middle[i] = i;
+      firstneigh_middle[i] = neighptr_middle;
+      numneigh_middle[i] = n_middle;
+      npnt_middle += n_middle;
+      if (n_middle > oneatom)
+        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+    }
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+  listinner->inum = nlocal;
+  if (respamiddle) listmiddle->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   multiple respa lists
+   binned neighbor list construction with partial Newton's 3rd law
+   each owned atom i checks own bin and surrounding bins in non-Newton stencil
+   pair stored once if i,j are both owned and i < j
+   pair stored by me if j is ghost (also stored by proc owning j)
+------------------------------------------------------------------------- */
+
+void Neighbor::respa_bin_no_newton_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+
+  NeighList *listinner = list->listinner;
+  if (nthreads > listinner->maxpage)
+    listinner->add_pages(nthreads - listinner->maxpage);
+
+  NeighList *listmiddle;
+  const int respamiddle = list->respamiddle;
+  if (respamiddle) {
+    listmiddle = list->listmiddle;
+    if (nthreads > listmiddle->maxpage)
+      listmiddle->add_pages(nthreads - listmiddle->maxpage);
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list,listinner,listmiddle)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,itype,jtype,ibin,n_inner,n_middle;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr,*neighptr_inner,*neighptr_middle;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int nstencil = list->nstencil;
+  int *stencil = list->stencil;
+
+  int *ilist_inner = listinner->ilist;
+  int *numneigh_inner = listinner->numneigh;
+  int **firstneigh_inner = listinner->firstneigh;
+
+  int *ilist_middle,*numneigh_middle,**firstneigh_middle;
+  if (respamiddle) {
+    ilist_middle = listmiddle->ilist;
+    numneigh_middle = listmiddle->numneigh;
+    firstneigh_middle = listmiddle->firstneigh;
+  }
+
+  int npage = tid;
+  int npnt = 0;
+  int npage_inner = tid;
+  int npnt_inner = 0;
+  int npage_middle = tid;
+  int npnt_middle = 0;
+
+  int which = 0;
+  int minchange = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage == list->maxpage) list->add_pages(nthreads);
+    }
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt_inner < oneatom) {
+      npnt_inner = 0;
+      npage_inner += nthreads;
+      if (npage_inner == listinner->maxpage) listinner->add_pages(nthreads);
+    }
+    neighptr_inner = &(listinner->pages[npage_inner][npnt_inner]);
+    n_inner = 0;
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (respamiddle) {
+      if (pgsize - npnt_middle < oneatom) {
+        npnt_middle = 0;
+        npage_middle += nthreads;
+        if (npage_middle == listmiddle->maxpage) listmiddle->add_pages(nthreads);
+      }
+      neighptr_middle = &(listmiddle->pages[npage_middle][npnt_middle]);
+      n_middle = 0;
+    }
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    ibin = coord2bin(x[i]);
+
+    // loop over all atoms in surrounding bins in stencil including self
+    // only store pair if i < j
+    // stores own/own pairs only once
+    // stores own/ghost pairs on both procs
+
+    for (k = 0; k < nstencil; k++) {
+      for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+        if (j <= i) continue;
+
+        jtype = type[j];
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (minchange = domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+
+          if (rsq < cut_inner_sq) {
+            if (which == 0) neighptr_inner[n_inner++] = j;
+            else if (minchange) neighptr_inner[n_inner++] = j;
+            else if (which > 0)
+              neighptr_inner[n_inner++] = j ^ (which << SBBITS);
+          }
+
+          if (respamiddle &&
+              rsq < cut_middle_sq && rsq > cut_middle_inside_sq) {
+            if (which == 0) neighptr_middle[n_middle++] = j;
+            else if (minchange) neighptr_middle[n_middle++] = j;
+            else if (which > 0)
+              neighptr_middle[n_middle++] = j ^ (which << SBBITS);
+          }
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+
+    ilist_inner[i] = i;
+    firstneigh_inner[i] = neighptr_inner;
+    numneigh_inner[i] = n_inner;
+    npnt_inner += n_inner;
+    if (n_inner > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+
+    if (respamiddle) {
+      ilist_middle[i] = i;
+      firstneigh_middle[i] = neighptr_middle;
+      numneigh_middle[i] = n_middle;
+      npnt_middle += n_middle;
+      if (n_middle > oneatom)
+        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+    }
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+  listinner->inum = nlocal;
+  if (respamiddle) listmiddle->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   multiple respa lists
+   binned neighbor list construction with full Newton's 3rd law
+   each owned atom i checks its own bin and other bins in Newton stencil
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+void Neighbor::respa_bin_newton_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+
+  NeighList *listinner = list->listinner;
+  if (nthreads > listinner->maxpage)
+    listinner->add_pages(nthreads - listinner->maxpage);
+
+  NeighList *listmiddle;
+  const int respamiddle = list->respamiddle;
+  if (respamiddle) {
+    listmiddle = list->listmiddle;
+    if (nthreads > listmiddle->maxpage)
+      listmiddle->add_pages(nthreads - listmiddle->maxpage);
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list,listinner,listmiddle)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,itype,jtype,ibin,n_inner,n_middle;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr,*neighptr_inner,*neighptr_middle;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int nstencil = list->nstencil;
+  int *stencil = list->stencil;
+
+  int *ilist_inner = listinner->ilist;
+  int *numneigh_inner = listinner->numneigh;
+  int **firstneigh_inner = listinner->firstneigh;
+
+  int *ilist_middle,*numneigh_middle,**firstneigh_middle;
+  if (respamiddle) {
+    ilist_middle = listmiddle->ilist;
+    numneigh_middle = listmiddle->numneigh;
+    firstneigh_middle = listmiddle->firstneigh;
+  }
+
+  int npage = tid;
+  int npnt = 0;
+  int npage_inner = tid;
+  int npnt_inner = 0;
+  int npage_middle = tid;
+  int npnt_middle = 0;
+
+  int which = 0;
+  int minchange = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage == list->maxpage) list->add_pages(nthreads);
+    }
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt_inner < oneatom) {
+      npnt_inner = 0;
+      npage_inner += nthreads;
+      if (npage_inner == listinner->maxpage) listinner->add_pages(nthreads);
+    }
+    neighptr_inner = &(listinner->pages[npage_inner][npnt_inner]);
+    n_inner = 0;
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (respamiddle) {
+      if (pgsize - npnt_middle < oneatom) {
+        npnt_middle = 0;
+        npage_middle += nthreads;
+        if (npage_middle == listmiddle->maxpage) listmiddle->add_pages(nthreads);
+      }
+      neighptr_middle = &(listmiddle->pages[npage_middle][npnt_middle]);
+      n_middle = 0;
+    }
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over rest of atoms in i's bin, ghosts are at end of linked list
+    // if j is owned atom, store it, since j is beyond i in linked list
+    // if j is ghost, only store if j coords are "above and to the right" of i
+
+    for (j = bins[i]; j >= 0; j = bins[j]) {
+      if (j >= nlocal) {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp) {
+          if (x[j][1] < ytmp) continue;
+          if (x[j][1] == ytmp && x[j][0] < xtmp) continue;
+        }
+      }
+
+      jtype = type[j];
+      if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq <= cutneighsq[itype][jtype]) {
+        if (molecular) {
+          which = find_special(special[i],nspecial[i],tag[j]);
+          if (which == 0) neighptr[n++] = j;
+          else if (minchange = domain->minimum_image_check(delx,dely,delz))
+            neighptr[n++] = j;
+          else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+        } else neighptr[n++] = j;
+
+        if (rsq < cut_inner_sq) {
+          if (which == 0) neighptr_inner[n_inner++] = j;
+          else if (minchange) neighptr_inner[n_inner++] = j;
+          else if (which > 0) neighptr_inner[n_inner++] = j ^ (which << SBBITS);
+        }
+
+        if (respamiddle &&
+            rsq < cut_middle_sq && rsq > cut_middle_inside_sq) {
+          if (which == 0) neighptr_middle[n_middle++] = j;
+          else if (minchange) neighptr_middle[n_middle++] = j;
+          else if (which > 0)
+            neighptr_middle[n_middle++] = j ^ (which << SBBITS);
+        }
+      }
+    }
+
+    // loop over all atoms in other bins in stencil, store every pair
+
+    ibin = coord2bin(x[i]);
+    for (k = 0; k < nstencil; k++) {
+      for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+        jtype = type[j];
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (minchange = domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+
+          if (rsq < cut_inner_sq) {
+            if (which == 0) neighptr_inner[n_inner++] = j;
+            else if (minchange) neighptr_inner[n_inner++] = j;
+            else if (which > 0)
+              neighptr_inner[n_inner++] = j ^ (which << SBBITS);
+          }
+
+          if (respamiddle &&
+              rsq < cut_middle_sq && rsq > cut_middle_inside_sq) {
+            if (which == 0) neighptr_middle[n_middle++] = j;
+            else if (minchange) neighptr_middle[n_middle++] = j;
+            else if (which > 0)
+              neighptr_middle[n_middle++] = j ^ (which << SBBITS);
+          }
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+
+    ilist_inner[i] = i;
+    firstneigh_inner[i] = neighptr_inner;
+    numneigh_inner[i] = n_inner;
+    npnt_inner += n_inner;
+    if (n_inner > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+
+    if (respamiddle) {
+      ilist_middle[i] = i;
+      firstneigh_middle[i] = neighptr_middle;
+      numneigh_middle[i] = n_middle;
+      npnt_middle += n_middle;
+      if (n_middle > oneatom)
+        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+    }
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+  listinner->inum = nlocal;
+  if (respamiddle) listmiddle->inum = nlocal;
+}
+
+/* ----------------------------------------------------------------------
+   multiple respa lists
+   binned neighbor list construction with Newton's 3rd law for triclinic
+   each owned atom i checks its own bin and other bins in triclinic stencil
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+void Neighbor::respa_bin_newton_tri_omp(NeighList *list)
+{
+  // bin local & ghost atoms
+
+  bin_atoms();
+
+  const int nlocal = (includegroup) ? atom->nfirst : atom->nlocal;
+
+  NEIGH_OMP_INIT;
+
+  NeighList *listinner = list->listinner;
+  if (nthreads > listinner->maxpage)
+    listinner->add_pages(nthreads - listinner->maxpage);
+
+  NeighList *listmiddle;
+  const int respamiddle = list->respamiddle;
+  if (respamiddle) {
+    listmiddle = list->listmiddle;
+    if (nthreads > listmiddle->maxpage)
+      listmiddle->add_pages(nthreads - listmiddle->maxpage);
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(list,listinner,listmiddle)
+#endif
+  NEIGH_OMP_SETUP(nlocal);
+
+  int i,j,k,n,itype,jtype,ibin,n_inner,n_middle;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq;
+  int *neighptr,*neighptr_inner,*neighptr_middle;
+
+  // loop over each atom, storing neighbors
+
+  int **special = atom->special;
+  int **nspecial = atom->nspecial;
+  int *tag = atom->tag;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  int *mask = atom->mask;
+  int *molecule = atom->molecule;
+  int molecular = atom->molecular;
+
+  int *ilist = list->ilist;
+  int *numneigh = list->numneigh;
+  int **firstneigh = list->firstneigh;
+  int nstencil = list->nstencil;
+  int *stencil = list->stencil;
+
+  int *ilist_inner = listinner->ilist;
+  int *numneigh_inner = listinner->numneigh;
+  int **firstneigh_inner = listinner->firstneigh;
+
+  int *ilist_middle,*numneigh_middle,**firstneigh_middle;
+  if (respamiddle) {
+    ilist_middle = listmiddle->ilist;
+    numneigh_middle = listmiddle->numneigh;
+    firstneigh_middle = listmiddle->firstneigh;
+  }
+
+  int npage = tid;
+  int npnt = 0;
+  int npage_inner = tid;
+  int npnt_inner = 0;
+  int npage_middle = tid;
+  int npnt_middle = 0;
+
+  int which = 0;
+  int minchange = 0;
+
+  for (i = ifrom; i < ito; i++) {
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt < oneatom) {
+      npnt = 0;
+      npage += nthreads;
+      if (npage == list->maxpage) list->add_pages(nthreads);
+    }
+    neighptr = &(list->pages[npage][npnt]);
+    n = 0;
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (pgsize - npnt_inner < oneatom) {
+      npnt_inner = 0;
+      npage_inner += nthreads;
+      if (npage_inner == listinner->maxpage) listinner->add_pages(nthreads);
+    }
+    neighptr_inner = &(listinner->pages[npage_inner][npnt_inner]);
+    n_inner = 0;
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+    if (respamiddle) {
+      if (pgsize - npnt_middle < oneatom) {
+        npnt_middle = 0;
+        npage_middle += nthreads;
+        if (npage_middle == listmiddle->maxpage) listmiddle->add_pages(nthreads);
+      }
+      neighptr_middle = &(listmiddle->pages[npage_middle][npnt_middle]);
+      n_middle = 0;
+    }
+
+    itype = type[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    // loop over all atoms in bins in stencil
+    // pairs for atoms j "below" i are excluded
+    // below = lower z or (equal z and lower y) or (equal zy and lower x)
+    //         (equal zyx and j <= i)
+    // latter excludes self-self interaction but allows superposed atoms
+
+    ibin = coord2bin(x[i]);
+    for (k = 0; k < nstencil; k++) {
+      for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp) {
+          if (x[j][1] < ytmp) continue;
+          if (x[j][1] == ytmp) {
+            if (x[j][0] < xtmp) continue;
+            if (x[j][0] == xtmp && j <= i) continue;
+          }
+        }
+
+        jtype = type[j];
+        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq <= cutneighsq[itype][jtype]) {
+          if (molecular) {
+            which = find_special(special[i],nspecial[i],tag[j]);
+            if (which == 0) neighptr[n++] = j;
+            else if (minchange = domain->minimum_image_check(delx,dely,delz))
+              neighptr[n++] = j;
+            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+          } else neighptr[n++] = j;
+
+          if (rsq < cut_inner_sq) {
+            if (which == 0) neighptr_inner[n_inner++] = j;
+            else if (minchange) neighptr_inner[n_inner++] = j;
+            else if (which > 0)
+              neighptr_inner[n_inner++] = j ^ (which << SBBITS);
+          }
+
+          if (respamiddle &&
+              rsq < cut_middle_sq && rsq > cut_middle_inside_sq) {
+            if (which == 0) neighptr_middle[n_middle++] = j;
+            else if (minchange) neighptr_middle[n_middle++] = j;
+            else if (which > 0)
+              neighptr_middle[n_middle++] = j ^ (which << SBBITS);
+          }
+        }
+      }
+    }
+
+    ilist[i] = i;
+    firstneigh[i] = neighptr;
+    numneigh[i] = n;
+    npnt += n;
+    if (n > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+
+    ilist_inner[i] = i;
+    firstneigh_inner[i] = neighptr_inner;
+    numneigh_inner[i] = n_inner;
+    npnt_inner += n_inner;
+    if (n_inner > oneatom)
+      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+
+    if (respamiddle) {
+      ilist_middle[i] = i;
+      firstneigh_middle[i] = neighptr_middle;
+      numneigh_middle[i] = n_middle;
+      npnt_middle += n_middle;
+      if (n_middle > oneatom)
+        error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
+    }
+  }
+  NEIGH_OMP_CLOSE;
+  list->inum = nlocal;
+  listinner->inum = nlocal;
+  if (respamiddle) listmiddle->inum = nlocal;
+}
diff --git a/src/USER-OMP/neighbor_omp.h b/src/USER-OMP/neighbor_omp.h
new file mode 100644
index 0000000000..fd27ba8b61
--- /dev/null
+++ b/src/USER-OMP/neighbor_omp.h
@@ -0,0 +1,59 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_NEIGHBOR_OMP_H
+#define LMP_NEIGHBOR_OMP_H
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+namespace LAMMPS_NS {
+
+// these macros hide some ugly and redundant OpenMP related stuff
+#if defined(_OPENMP)
+
+// make sure we have at least one page for each thread
+#define NEIGH_OMP_INIT                                        \
+  const int nthreads = comm->nthreads;                        \
+  if (nthreads > list->maxpage)                                \
+    list->add_pages(nthreads - list->maxpage)
+
+// get thread id and then assign each thread a fixed chunk of atoms
+#define NEIGH_OMP_SETUP(num)                                \
+  {                                                        \
+    const int tid = omp_get_thread_num();                \
+    const int idelta = 1 + num/nthreads;                \
+    const int ifrom = tid*idelta;                        \
+    const int ito   = ((ifrom + idelta) > num)                \
+      ? num : (ifrom+idelta);                                \
+
+#define NEIGH_OMP_CLOSE }
+
+#else /* !defined(_OPENMP) */
+
+#define NEIGH_OMP_INIT                                        \
+  const int nthreads = comm->nthreads;
+
+#define NEIGH_OMP_SETUP(num)                                \
+    const int tid = 0;                                        \
+    const int ifrom = 0;                                \
+    const int ito = num
+
+#define NEIGH_OMP_CLOSE
+
+#endif
+
+}
+
+#endif
diff --git a/src/USER-OMP/pair_adp_omp.cpp b/src/USER-OMP/pair_adp_omp.cpp
new file mode 100644
index 0000000000..4d05b16c70
--- /dev/null
+++ b/src/USER-OMP/pair_adp_omp.cpp
@@ -0,0 +1,390 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "string.h"
+
+#include "pair_adp_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairADPOMP::PairADPOMP(LAMMPS *lmp) :
+  PairADP(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairADPOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = eflag_global = eflag_atom = 0;
+
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  // grow energy and fp arrays if necessary
+  // need to be atom->nmax in length
+
+  if (atom->nmax > nmax) {
+    memory->destroy(rho);
+    memory->destroy(fp);
+    memory->destroy(mu);
+    memory->destroy(lambda);
+    nmax = atom->nmax;
+    memory->create(rho,nthreads*nmax,"pair:rho");
+    memory->create(fp,nmax,"pair:fp");
+    memory->create(mu,nthreads*nmax,3,"pair:mu");
+    memory->create(lambda,nthreads*nmax,6,"pair:lambda");
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (force->newton_pair)
+      thr->init_adp(nall, rho, mu, lambda);
+    else
+      thr->init_adp(nlocal, rho, mu, lambda);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairADPOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,m,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r,p,rhoip,rhojp,z2,z2p,recip,phip,psip,phi;
+  double u2,u2p,w2,w2p,nu;
+  double *coeff;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  double delmux,delmuy,delmuz,trdelmu,tradellam;
+  double adpx,adpy,adpz,fx,fy,fz;
+  double sumlamxx,sumlamyy,sumlamzz,sumlamyz,sumlamxz,sumlamxy;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  double * const rho_t = thr->get_rho();
+  double * const * const mu_t = thr->get_mu();
+  double * const * const lambda_t = thr->get_lambda();
+  const int tid = thr->get_tid();
+
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int nall = nlocal + atom->nghost;
+
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // rho = density at each atom
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq) {
+        jtype = type[j];
+        p = sqrt(rsq)*rdr + 1.0;
+        m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+        coeff = rhor_spline[type2rhor[jtype][itype]][m];
+        rho_t[i] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+        coeff = u2r_spline[type2u2r[jtype][itype]][m];
+        u2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+        mu_t[i][0] += u2*delx;
+        mu_t[i][1] += u2*dely;
+        mu_t[i][2] += u2*delz;
+        coeff = w2r_spline[type2w2r[jtype][itype]][m];
+        w2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+        lambda_t[i][0] += w2*delx*delx;
+        lambda_t[i][1] += w2*dely*dely;
+        lambda_t[i][2] += w2*delz*delz;
+        lambda_t[i][3] += w2*dely*delz;
+        lambda_t[i][4] += w2*delx*delz;
+        lambda_t[i][5] += w2*delx*dely;
+
+        if (NEWTON_PAIR || j < nlocal) {
+          // verify sign difference for mu and lambda
+          coeff = rhor_spline[type2rhor[itype][jtype]][m];
+          rho_t[j] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+          coeff = u2r_spline[type2u2r[itype][jtype]][m];
+          u2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+          mu_t[j][0] -= u2*delx;
+          mu_t[j][1] -= u2*dely;
+          mu_t[j][2] -= u2*delz;
+          coeff = w2r_spline[type2w2r[itype][jtype]][m];
+          w2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+          lambda_t[j][0] += w2*delx*delx;
+          lambda_t[j][1] += w2*dely*dely;
+          lambda_t[j][2] += w2*delz*delz;
+          lambda_t[j][3] += w2*dely*delz;
+          lambda_t[j][4] += w2*delx*delz;
+          lambda_t[j][5] += w2*delx*dely;
+        }
+      }
+    }
+  }
+
+  // wait until all threads are done with computation
+  sync_threads();
+
+  // communicate and sum densities
+
+  if (NEWTON_PAIR) {
+    // reduce per thread density
+    data_reduce_thr(&(rho[0]), nall, comm->nthreads, 1, tid);
+    data_reduce_thr(&(mu[0][0]), nall, comm->nthreads, 3, tid);
+    data_reduce_thr(&(lambda[0][0]), nall, comm->nthreads, 6, tid);
+
+    // wait until reduction is complete
+    sync_threads();
+
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+    { comm->reverse_comm_pair(this); }
+
+    // wait until master thread is done with communication
+    sync_threads();
+
+  } else {
+    // reduce per thread density
+    data_reduce_thr(&(rho[0]), nlocal, comm->nthreads, 1, tid);
+    data_reduce_thr(&(mu[0][0]), nlocal, comm->nthreads, 3, tid);
+    data_reduce_thr(&(lambda[0][0]), nlocal, comm->nthreads, 6, tid);
+
+    // wait until reduction is complete
+    sync_threads();
+  }
+
+  // fp = derivative of embedding energy at each atom
+  // phi = embedding energy at each atom
+
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    p = rho[i]*rdrho + 1.0;
+    m = static_cast<int> (p);
+    m = MAX(1,MIN(m,nrho-1));
+    p -= m;
+    p = MIN(p,1.0);
+    coeff = frho_spline[type2frho[type[i]]][m];
+    fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2];
+    if (EFLAG) {
+      phi = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+      phi += 0.5*(mu[i][0]*mu[i][0]+mu[i][1]*mu[i][1]+mu[i][2]*mu[i][2]);
+      phi += 0.5*(lambda[i][0]*lambda[i][0]+lambda[i][1]*
+                  lambda[i][1]+lambda[i][2]*lambda[i][2]);
+      phi += 1.0*(lambda[i][3]*lambda[i][3]+lambda[i][4]*
+                  lambda[i][4]+lambda[i][5]*lambda[i][5]);
+      phi -= 1.0/6.0*(lambda[i][0]+lambda[i][1]+lambda[i][2])*
+        (lambda[i][0]+lambda[i][1]+lambda[i][2]);
+      e_tally_thr(this,i,i,nlocal,/* newton_pair */ 1, phi, 0.0, thr);
+    }
+  }
+
+  // wait until all theads are done with computation
+  sync_threads();
+
+  // communicate derivative of embedding function
+  // MPI communication only on master thread
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+  { comm->forward_comm_pair(this); }
+
+  // wait until master thread is done with communication
+  sync_threads();
+
+  // compute forces on each atom
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    fxtmp = fytmp = fztmp = 0.0;
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq) {
+        jtype = type[j];
+        r = sqrt(rsq);
+        p = r*rdr + 1.0;
+        m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+
+        // rhoip = derivative of (density at atom j due to atom i)
+        // rhojp = derivative of (density at atom i due to atom j)
+        // phi = pair potential energy
+        // phip = phi'
+        // z2 = phi * r
+        // z2p = (phi * r)' = (phi' r) + phi
+        // u2 = u
+        // u2p = u'
+        // w2 = w
+        // w2p = w'
+        // psip needs both fp[i] and fp[j] terms since r_ij appears in two
+        //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
+        //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
+
+        coeff = rhor_spline[type2rhor[itype][jtype]][m];
+        rhoip = (coeff[0]*p + coeff[1])*p + coeff[2];
+        coeff = rhor_spline[type2rhor[jtype][itype]][m];
+        rhojp = (coeff[0]*p + coeff[1])*p + coeff[2];
+        coeff = z2r_spline[type2z2r[itype][jtype]][m];
+        z2p = (coeff[0]*p + coeff[1])*p + coeff[2];
+        z2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+        coeff = u2r_spline[type2u2r[itype][jtype]][m];
+        u2p = (coeff[0]*p + coeff[1])*p + coeff[2];
+        u2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+        coeff = w2r_spline[type2w2r[itype][jtype]][m];
+        w2p = (coeff[0]*p + coeff[1])*p + coeff[2];
+        w2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+
+        recip = 1.0/r;
+        phi = z2*recip;
+        phip = z2p*recip - phi*recip;
+        psip = fp[i]*rhojp + fp[j]*rhoip + phip;
+        fpair = -psip*recip;
+
+        delmux = mu[i][0]-mu[j][0];
+        delmuy = mu[i][1]-mu[j][1];
+        delmuz = mu[i][2]-mu[j][2];
+        trdelmu = delmux*delx+delmuy*dely+delmuz*delz;
+        sumlamxx = lambda[i][0]+lambda[j][0];
+        sumlamyy = lambda[i][1]+lambda[j][1];
+        sumlamzz = lambda[i][2]+lambda[j][2];
+        sumlamyz = lambda[i][3]+lambda[j][3];
+        sumlamxz = lambda[i][4]+lambda[j][4];
+        sumlamxy = lambda[i][5]+lambda[j][5];
+        tradellam = sumlamxx*delx*delx+sumlamyy*dely*dely+
+          sumlamzz*delz*delz+2.0*sumlamxy*delx*dely+
+          2.0*sumlamxz*delx*delz+2.0*sumlamyz*dely*delz;
+        nu = sumlamxx+sumlamyy+sumlamzz;
+
+        adpx = delmux*u2 + trdelmu*u2p*delx*recip +
+          2.0*w2*(sumlamxx*delx+sumlamxy*dely+sumlamxz*delz) +
+          w2p*delx*recip*tradellam - 1.0/3.0*nu*(w2p*r+2.0*w2)*delx;
+        adpy = delmuy*u2 + trdelmu*u2p*dely*recip +
+          2.0*w2*(sumlamxy*delx+sumlamyy*dely+sumlamyz*delz) +
+          w2p*dely*recip*tradellam - 1.0/3.0*nu*(w2p*r+2.0*w2)*dely;
+        adpz = delmuz*u2 + trdelmu*u2p*delz*recip +
+          2.0*w2*(sumlamxz*delx+sumlamyz*dely+sumlamzz*delz) +
+          w2p*delz*recip*tradellam - 1.0/3.0*nu*(w2p*r+2.0*w2)*delz;
+        adpx*=-1.0; adpy*=-1.0; adpz*=-1.0;
+
+        fx = delx*fpair+adpx;
+        fy = dely*fpair+adpy;
+        fz = delz*fpair+adpz;
+
+        fxtmp += fx;
+        fytmp += fy;
+        fztmp += fz;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= fx;
+          f[j][1] -= fy;
+          f[j][2] -= fz;
+        }
+
+        if (EFLAG) evdwl = phi;
+        if (EVFLAG) ev_tally_xyz_thr(this,i,j,nlocal,NEWTON_PAIR,evdwl,0.0,
+                                     fx,fy,fz,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairADPOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairADP::memory_usage();
+  bytes += (comm->nthreads-1) * nmax * (10*sizeof(double) + 3*sizeof(double *));
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_adp_omp.h b/src/USER-OMP/pair_adp_omp.h
new file mode 100644
index 0000000000..e5c4c4af9f
--- /dev/null
+++ b/src/USER-OMP/pair_adp_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(adp/omp,PairADPOMP)
+
+#else
+
+#ifndef LMP_PAIR_ADP_OMP_H
+#define LMP_PAIR_ADP_OMP_H
+
+#include "pair_adp.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairADPOMP : public PairADP, public ThrOMP {
+
+ public:
+  PairADPOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int iifrom, int iito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_airebo_omp.cpp b/src/USER-OMP/pair_airebo_omp.cpp
new file mode 100644
index 0000000000..5000ea2a09
--- /dev/null
+++ b/src/USER-OMP/pair_airebo_omp.cpp
@@ -0,0 +1,2767 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_airebo_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define TOL 1.0e-9
+
+/* ---------------------------------------------------------------------- */
+
+PairAIREBOOMP::PairAIREBOOMP(LAMMPS *lmp) :
+  PairAIREBO(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairAIREBOOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = vflag_atom = 0;
+
+  REBO_neigh_thr();
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    FREBO_thr(ifrom,ito,evflag,eflag,vflag_atom,thr);
+    if (ljflag) FLJ_thr(ifrom,ito,evflag,eflag,vflag_atom,thr);
+    if (torflag) TORSION_thr(ifrom,ito,evflag,eflag,thr);
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ----------------------------------------------------------------------
+   Bij function
+------------------------------------------------------------------------- */
+
+double PairAIREBOOMP::bondorder_thr(int i, int j, double rij[3], double rijmag,
+                                    double VA, int vflag_atom, ThrData * const thr)
+{
+  int atomi,atomj,k,n,l,atomk,atoml,atomn,atom1,atom2,atom3,atom4;
+  int itype,jtype,ktype,ltype,ntype;
+  double rik[3],rjl[3],rkn[3],rji[3],rki[3],rlj[3],rknmag,dNki,dwjl,bij;
+  double NijC,NijH,NjiC,NjiH,wik,dwik,dwkn,wjl;
+  double rikmag,rjlmag,cosjik,cosijl,g,tmp2,tmp3;
+  double Etmp,pij,tmp,wij,dwij,NconjtmpI,NconjtmpJ,Nki,Nlj,dS;
+  double lamdajik,lamdaijl,dgdc,dgdN,pji,Nijconj,piRC;
+  double dcosjikdri[3],dcosijldri[3],dcosjikdrk[3];
+  double dN2[2],dN3[3];
+  double dcosjikdrj[3],dcosijldrj[3],dcosijldrl[3];
+  double Tij;
+  double r32[3],r32mag,cos321,r43[3],r13[3];
+  double dNlj;
+  double om1234,rln[3];
+  double rlnmag,dwln,r23[3],r23mag,r21[3],r21mag;
+  double w21,dw21,r34[3],r34mag,cos234,w34,dw34;
+  double cross321[3],cross234[3],prefactor,SpN;
+  double fcijpc,fcikpc,fcjlpc,fcjkpc,fcilpc;
+  double dt2dik[3],dt2djl[3],dt2dij[3],aa,aaa1,aaa2,at2,cw,cwnum,cwnom;
+  double sin321,sin234,rr,rijrik,rijrjl,rjk2,rik2,ril2,rjl2;
+  double dctik,dctjk,dctjl,dctij,dctji,dctil,rik2i,rjl2i,sink2i,sinl2i;
+  double rjk[3],ril[3],dt1dik,dt1djk,dt1djl,dt1dil,dt1dij;
+  double F23[3],F12[3],F34[3],F31[3],F24[3],fi[3],fj[3],fk[3],fl[3];
+  double f1[3],f2[3],f3[3],f4[4];
+  double dcut321,PijS,PjiS;
+  double rij2,tspjik,dtsjik,tspijl,dtsijl,costmp;
+  int *REBO_neighs,*REBO_neighs_i,*REBO_neighs_j,*REBO_neighs_k,*REBO_neighs_l;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  int *type = atom->type;
+
+  atomi = i;
+  atomj = j;
+  itype = map[type[i]];
+  jtype = map[type[j]];
+  wij = Sp(rijmag,rcmin[itype][jtype],rcmax[itype][jtype],dwij);
+  NijC = nC[i]-(wij*kronecker(jtype,0));
+  NijH = nH[i]-(wij*kronecker(jtype,1));
+  NjiC = nC[j]-(wij*kronecker(itype,0));
+  NjiH = nH[j]-(wij*kronecker(itype,1));
+  bij = 0.0;
+  tmp = 0.0;
+  tmp2 = 0.0;
+  tmp3 = 0.0;
+  dgdc = 0.0;
+  dgdN = 0.0;
+  NconjtmpI = 0.0;
+  NconjtmpJ = 0.0;
+  Etmp = 0.0;
+
+  REBO_neighs = REBO_firstneigh[i];
+  for (k = 0; k < REBO_numneigh[i]; k++) {
+    atomk = REBO_neighs[k];
+    if (atomk != atomj) {
+      ktype = map[type[atomk]];
+      rik[0] = x[atomi][0]-x[atomk][0];
+      rik[1] = x[atomi][1]-x[atomk][1];
+      rik[2] = x[atomi][2]-x[atomk][2];
+      rikmag = sqrt((rik[0]*rik[0])+(rik[1]*rik[1])+(rik[2]*rik[2]));
+      lamdajik = 4.0*kronecker(itype,1) *
+        ((rho[ktype][1]-rikmag)-(rho[jtype][1]-rijmag));
+      wik = Sp(rikmag,rcmin[itype][ktype],rcmax[itype][ktype],dS);
+      Nki = nC[atomk]-(wik*kronecker(itype,0))+nH[atomk] -
+        (wik*kronecker(itype,1));
+      cosjik = ((rij[0]*rik[0])+(rij[1]*rik[1])+(rij[2]*rik[2])) /
+        (rijmag*rikmag);
+      cosjik = MIN(cosjik,1.0);
+      cosjik = MAX(cosjik,-1.0);
+
+      // evaluate splines g and derivatives dg
+
+      g = gSpline(cosjik,(NijC+NijH),itype,&dgdc,&dgdN);
+      Etmp = Etmp+(wik*g*exp(lamdajik));
+      tmp3 = tmp3+(wik*dgdN*exp(lamdajik));
+      NconjtmpI = NconjtmpI+(kronecker(ktype,0)*wik*Sp(Nki,Nmin,Nmax,dS));
+    }
+  }
+
+  PijS = 0.0;
+  dN2[0] = 0.0;
+  dN2[1] = 0.0;
+  PijS = PijSpline(NijC,NijH,itype,jtype,dN2);
+  pij = 1.0/sqrt(1.0+Etmp+PijS);
+  tmp = -0.5*pij*pij*pij;
+
+  const double invrijm = 1.0/rijmag;
+  const double invrijm2 = invrijm*invrijm;
+
+  // pij forces
+
+  REBO_neighs = REBO_firstneigh[i];
+  for (k = 0; k < REBO_numneigh[i]; k++) {
+    atomk = REBO_neighs[k];
+    if (atomk != atomj) {
+      ktype = map[type[atomk]];
+      rik[0] = x[atomi][0]-x[atomk][0];
+      rik[1] = x[atomi][1]-x[atomk][1];
+      rik[2] = x[atomi][2]-x[atomk][2];
+      rikmag = sqrt((rik[0]*rik[0])+(rik[1]*rik[1])+(rik[2]*rik[2]));
+      lamdajik = 4.0*kronecker(itype,1) *
+        ((rho[ktype][1]-rikmag)-(rho[jtype][1]-rijmag));
+      wik = Sp(rikmag,rcmin[itype][ktype],rcmax[itype][ktype],dwik);
+
+      const double invrikm = 1.0/rikmag;
+      const double invrijkm = invrijm*invrikm;
+      const double invrikm2 = invrikm*invrikm;
+
+      cosjik = (rij[0]*rik[0] + rij[1]*rik[1] + rij[2]*rik[2]) * invrijkm;
+      cosjik = MIN(cosjik,1.0);
+      cosjik = MAX(cosjik,-1.0);
+
+      dcosjikdri[0] = ((rij[0]+rik[0])*invrijkm) -
+        (cosjik*((rij[0]*invrijm2)+(rik[0]*invrikm2)));
+      dcosjikdri[1] = ((rij[1]+rik[1])*invrijkm) -
+        (cosjik*((rij[1]*invrijm2)+(rik[1]*invrikm2)));
+      dcosjikdri[2] = ((rij[2]+rik[2])*invrijkm) -
+        (cosjik*((rij[2]*invrijm2)+(rik[2]*invrikm2)));
+      dcosjikdrk[0] = (-rij[0]*invrijkm) + (cosjik*(rik[0]*invrikm2));
+      dcosjikdrk[1] = (-rij[1]*invrijkm) + (cosjik*(rik[1]*invrikm2));
+      dcosjikdrk[2] = (-rij[2]*invrijkm) + (cosjik*(rik[2]*invrikm2));
+      dcosjikdrj[0] = (-rik[0]*invrijkm) + (cosjik*(rij[0]*invrijm2));
+      dcosjikdrj[1] = (-rik[1]*invrijkm) + (cosjik*(rij[1]*invrijm2));
+      dcosjikdrj[2] = (-rik[2]*invrijkm) + (cosjik*(rij[2]*invrijm2));
+
+      g = gSpline(cosjik,(NijC+NijH),itype,&dgdc,&dgdN);
+      tmp2 = VA*.5*(tmp*wik*dgdc*exp(lamdajik));
+      fj[0] = -tmp2*dcosjikdrj[0];
+      fj[1] = -tmp2*dcosjikdrj[1];
+      fj[2] = -tmp2*dcosjikdrj[2];
+      fi[0] = -tmp2*dcosjikdri[0];
+      fi[1] = -tmp2*dcosjikdri[1];
+      fi[2] = -tmp2*dcosjikdri[2];
+      fk[0] = -tmp2*dcosjikdrk[0];
+      fk[1] = -tmp2*dcosjikdrk[1];
+      fk[2] = -tmp2*dcosjikdrk[2];
+
+      tmp2 = VA*.5*(tmp*wik*g*exp(lamdajik)*4.0*kronecker(itype,1));
+      fj[0] -= tmp2*(-rij[0]*invrijm);
+      fj[1] -= tmp2*(-rij[1]*invrijm);
+      fj[2] -= tmp2*(-rij[2]*invrijm);
+      fi[0] -= tmp2*((-rik[0]*invrikm)+(rij[0]*invrijm));
+      fi[1] -= tmp2*((-rik[1]*invrikm)+(rij[1]*invrijm));
+      fi[2] -= tmp2*((-rik[2]*invrikm)+(rij[2]*invrijm));
+      fk[0] -= tmp2*(rik[0]*invrikm);
+      fk[1] -= tmp2*(rik[1]*invrikm);
+      fk[2] -= tmp2*(rik[2]*invrikm);
+
+      // coordination forces
+
+      // dwik forces
+
+      tmp2 = VA*.5*(tmp*dwik*g*exp(lamdajik))*invrikm;
+      fi[0] -= tmp2*rik[0];
+      fi[1] -= tmp2*rik[1];
+      fi[2] -= tmp2*rik[2];
+      fk[0] += tmp2*rik[0];
+      fk[1] += tmp2*rik[1];
+      fk[2] += tmp2*rik[2];
+
+      // PIJ forces
+
+      tmp2 = VA*.5*(tmp*dN2[ktype]*dwik)*invrikm;
+      fi[0] -= tmp2*rik[0];
+      fi[1] -= tmp2*rik[1];
+      fi[2] -= tmp2*rik[2];
+      fk[0] += tmp2*rik[0];
+      fk[1] += tmp2*rik[1];
+      fk[2] += tmp2*rik[2];
+
+      // dgdN forces
+
+      tmp2 = VA*.5*(tmp*tmp3*dwik)*invrikm;
+      fi[0] -= tmp2*rik[0];
+      fi[1] -= tmp2*rik[1];
+      fi[2] -= tmp2*rik[2];
+      fk[0] += tmp2*rik[0];
+      fk[1] += tmp2*rik[1];
+      fk[2] += tmp2*rik[2];
+
+      f[atomi][0] += fi[0]; f[atomi][1] += fi[1]; f[atomi][2] += fi[2];
+      f[atomj][0] += fj[0]; f[atomj][1] += fj[1]; f[atomj][2] += fj[2];
+      f[atomk][0] += fk[0]; f[atomk][1] += fk[1]; f[atomk][2] += fk[2];
+
+      if (vflag_atom) {
+        rji[0] = -rij[0]; rji[1] = -rij[1]; rji[2] = -rij[2];
+        rki[0] = -rik[0]; rki[1] = -rik[1]; rki[2] = -rik[2];
+        v_tally3_thr(atomi,atomj,atomk,fj,fk,rji,rki,thr);
+      }
+    }
+  }
+
+  tmp = 0.0;
+  tmp2 = 0.0;
+  tmp3 = 0.0;
+  Etmp = 0.0;
+
+  REBO_neighs = REBO_firstneigh[j];
+  for (l = 0; l < REBO_numneigh[j]; l++) {
+    atoml = REBO_neighs[l];
+    if (atoml != atomi) {
+      ltype = map[type[atoml]];
+      rjl[0] = x[atomj][0]-x[atoml][0];
+      rjl[1] = x[atomj][1]-x[atoml][1];
+      rjl[2] = x[atomj][2]-x[atoml][2];
+      rjlmag = sqrt((rjl[0]*rjl[0])+(rjl[1]*rjl[1])+(rjl[2]*rjl[2]));
+      lamdaijl = 4.0*kronecker(jtype,1) *
+        ((rho[ltype][1]-rjlmag)-(rho[itype][1]-rijmag));
+      wjl = Sp(rjlmag,rcmin[jtype][ltype],rcmax[jtype][ltype],dS);
+      Nlj = nC[atoml]-(wjl*kronecker(jtype,0)) +
+        nH[atoml]-(wjl*kronecker(jtype,1));
+      cosijl = -1.0*((rij[0]*rjl[0])+(rij[1]*rjl[1])+(rij[2]*rjl[2])) /
+        (rijmag*rjlmag);
+      cosijl = MIN(cosijl,1.0);
+      cosijl = MAX(cosijl,-1.0);
+
+      // evaluate splines g and derivatives dg
+
+      g = gSpline(cosijl,NjiC+NjiH,jtype,&dgdc,&dgdN);
+      Etmp = Etmp+(wjl*g*exp(lamdaijl));
+      tmp3 = tmp3+(wjl*dgdN*exp(lamdaijl));
+      NconjtmpJ = NconjtmpJ+(kronecker(ltype,0)*wjl*Sp(Nlj,Nmin,Nmax,dS));
+    }
+  }
+
+  PjiS = 0.0;
+  dN2[0] = 0.0;
+  dN2[1] = 0.0;
+  PjiS = PijSpline(NjiC,NjiH,jtype,itype,dN2);
+  pji = 1.0/sqrt(1.0+Etmp+PjiS);
+  tmp = -0.5*pji*pji*pji;
+
+  REBO_neighs = REBO_firstneigh[j];
+  for (l = 0; l < REBO_numneigh[j]; l++) {
+    atoml = REBO_neighs[l];
+    if (atoml != atomi) {
+      ltype = map[type[atoml]];
+      rjl[0] = x[atomj][0]-x[atoml][0];
+      rjl[1] = x[atomj][1]-x[atoml][1];
+      rjl[2] = x[atomj][2]-x[atoml][2];
+      rjlmag = sqrt((rjl[0]*rjl[0])+(rjl[1]*rjl[1])+(rjl[2]*rjl[2]));
+      lamdaijl = 4.0*kronecker(jtype,1) *
+        ((rho[ltype][1]-rjlmag)-(rho[itype][1]-rijmag));
+      wjl = Sp(rjlmag,rcmin[jtype][ltype],rcmax[jtype][ltype],dwjl);
+
+      const double invrjlm = 1.0/rjlmag;
+      const double invrijlm = invrijm*invrjlm;
+      const double invrjlm2 = invrjlm*invrjlm;
+
+      cosijl = (-1.0*((rij[0]*rjl[0])+(rij[1]*rjl[1])+(rij[2]*rjl[2])))
+        * invrijlm;
+
+      cosijl = MIN(cosijl,1.0);
+      cosijl = MAX(cosijl,-1.0);
+
+      dcosijldri[0] = (-rjl[0]*invrijlm) - (cosijl*rij[0]*invrijm2);
+      dcosijldri[1] = (-rjl[1]*invrijlm) - (cosijl*rij[1]*invrijm2);
+      dcosijldri[2] = (-rjl[2]*invrijlm) - (cosijl*rij[2]*invrijm2);
+      dcosijldrj[0] = ((-rij[0]+rjl[0])*invrijlm) +
+        (cosijl*((rij[0]*invrijm2)-(rjl[0]*invrjlm2)));
+      dcosijldrj[1] = ((-rij[1]+rjl[1])*invrijlm) +
+        (cosijl*((rij[1]*invrijm2)-(rjl[1]*invrjlm2)));
+      dcosijldrj[2] = ((-rij[2]+rjl[2])*invrijlm) +
+        (cosijl*((rij[2]*invrijm2)-(rjl[2]*invrjlm2)));
+      dcosijldrl[0] = (rij[0]*invrijlm)+(cosijl*rjl[0]*invrjlm2);
+      dcosijldrl[1] = (rij[1]*invrijlm)+(cosijl*rjl[1]*invrjlm2);
+      dcosijldrl[2] = (rij[2]*invrijlm)+(cosijl*rjl[2]*invrjlm2);
+
+      // evaluate splines g and derivatives dg
+
+      g = gSpline(cosijl,NjiC+NjiH,jtype,&dgdc,&dgdN);
+      tmp2 = VA*.5*(tmp*wjl*dgdc*exp(lamdaijl));
+      fi[0] = -tmp2*dcosijldri[0];
+      fi[1] = -tmp2*dcosijldri[1];
+      fi[2] = -tmp2*dcosijldri[2];
+      fj[0] = -tmp2*dcosijldrj[0];
+      fj[1] = -tmp2*dcosijldrj[1];
+      fj[2] = -tmp2*dcosijldrj[2];
+      fl[0] = -tmp2*dcosijldrl[0];
+      fl[1] = -tmp2*dcosijldrl[1];
+      fl[2] = -tmp2*dcosijldrl[2];
+
+      tmp2 = VA*.5*(tmp*wjl*g*exp(lamdaijl)*4.0*kronecker(jtype,1));
+      fi[0] -= tmp2*(rij[0]*invrijm);
+      fi[1] -= tmp2*(rij[1]*invrijm);
+      fi[2] -= tmp2*(rij[2]*invrijm);
+      fj[0] -= tmp2*((-rjl[0]*invrjlm)-(rij[0]*invrijm));
+      fj[1] -= tmp2*((-rjl[1]*invrjlm)-(rij[1]*invrijm));
+      fj[2] -= tmp2*((-rjl[2]*invrjlm)-(rij[2]*invrijm));
+      fl[0] -= tmp2*(rjl[0]*invrjlm);
+      fl[1] -= tmp2*(rjl[1]*invrjlm);
+      fl[2] -= tmp2*(rjl[2]*invrjlm);
+
+      // coordination forces
+
+      // dwik forces
+
+      tmp2 = VA*.5*(tmp*dwjl*g*exp(lamdaijl))*invrjlm;
+      fj[0] -= tmp2*rjl[0];
+      fj[1] -= tmp2*rjl[1];
+      fj[2] -= tmp2*rjl[2];
+      fl[0] += tmp2*rjl[0];
+      fl[1] += tmp2*rjl[1];
+      fl[2] += tmp2*rjl[2];
+
+      // PIJ forces
+
+      tmp2 = VA*.5*(tmp*dN2[ltype]*dwjl)*invrjlm;
+      fj[0] -= tmp2*rjl[0];
+      fj[1] -= tmp2*rjl[1];
+      fj[2] -= tmp2*rjl[2];
+      fl[0] += tmp2*rjl[0];
+      fl[1] += tmp2*rjl[1];
+      fl[2] += tmp2*rjl[2];
+
+      // dgdN forces
+
+      tmp2 = VA*.5*(tmp*tmp3*dwjl)*invrjlm;
+      fj[0] -= tmp2*rjl[0];
+      fj[1] -= tmp2*rjl[1];
+      fj[2] -= tmp2*rjl[2];
+      fl[0] += tmp2*rjl[0];
+      fl[1] += tmp2*rjl[1];
+      fl[2] += tmp2*rjl[2];
+
+      f[atomi][0] += fi[0]; f[atomi][1] += fi[1]; f[atomi][2] += fi[2];
+      f[atomj][0] += fj[0]; f[atomj][1] += fj[1]; f[atomj][2] += fj[2];
+      f[atoml][0] += fl[0]; f[atoml][1] += fl[1]; f[atoml][2] += fl[2];
+
+      if (vflag_atom) {
+        rlj[0] = -rjl[0]; rlj[1] = -rjl[1]; rlj[2] = -rjl[2];
+        v_tally3_thr(atomi,atomj,atoml,fi,fl,rij,rlj,thr);
+      }
+    }
+  }
+
+  // evaluate Nij conj
+
+  Nijconj = 1.0+(NconjtmpI*NconjtmpI)+(NconjtmpJ*NconjtmpJ);
+  piRC = piRCSpline(NijC+NijH,NjiC+NjiH,Nijconj,itype,jtype,dN3);
+
+  // piRC forces
+
+  REBO_neighs_i = REBO_firstneigh[i];
+  for (k = 0; k < REBO_numneigh[i]; k++) {
+    atomk = REBO_neighs_i[k];
+    if (atomk !=atomj) {
+      ktype = map[type[atomk]];
+      rik[0] = x[atomi][0]-x[atomk][0];
+      rik[1] = x[atomi][1]-x[atomk][1];
+      rik[2] = x[atomi][2]-x[atomk][2];
+      rikmag = sqrt((rik[0]*rik[0])+(rik[1]*rik[1])+(rik[2]*rik[2]));
+      wik = Sp(rikmag,rcmin[itype][ktype],rcmax[itype][ktype],dwik);
+      Nki = nC[atomk]-(wik*kronecker(itype,0))+nH[atomk] -
+        (wik*kronecker(itype,1));
+      SpN = Sp(Nki,Nmin,Nmax,dNki);
+
+      tmp2 = VA*dN3[0]*dwik/rikmag;
+      f[atomi][0] -= tmp2*rik[0];
+      f[atomi][1] -= tmp2*rik[1];
+      f[atomi][2] -= tmp2*rik[2];
+      f[atomk][0] += tmp2*rik[0];
+      f[atomk][1] += tmp2*rik[1];
+      f[atomk][2] += tmp2*rik[2];
+
+      if (vflag_atom) v_tally2_thr(atomi,atomk,-tmp2,rik,thr);
+
+      tmp2 = VA*dN3[2]*(2.0*NconjtmpI*dwik*SpN)/rikmag;
+      f[atomi][0] -= tmp2*rik[0];
+      f[atomi][1] -= tmp2*rik[1];
+      f[atomi][2] -= tmp2*rik[2];
+      f[atomk][0] += tmp2*rik[0];
+      f[atomk][1] += tmp2*rik[1];
+      f[atomk][2] += tmp2*rik[2];
+
+      if (vflag_atom) v_tally2_thr(atomi,atomk,-tmp2,rik,thr);
+
+      if (fabs(dNki) > TOL) {
+        REBO_neighs_k = REBO_firstneigh[atomk];
+        for (n = 0; n < REBO_numneigh[atomk]; n++) {
+          atomn = REBO_neighs_k[n];
+          if (atomn != atomi) {
+            ntype = map[type[atomn]];
+            rkn[0] = x[atomk][0]-x[atomn][0];
+            rkn[1] = x[atomk][1]-x[atomn][1];
+            rkn[2] = x[atomk][2]-x[atomn][2];
+            rknmag = sqrt((rkn[0]*rkn[0])+(rkn[1]*rkn[1])+(rkn[2]*rkn[2]));
+            Sp(rknmag,rcmin[ktype][ntype],rcmax[ktype][ntype],dwkn);
+
+            tmp2 = VA*dN3[2]*(2.0*NconjtmpI*wik*dNki*dwkn)/rknmag;
+            f[atomk][0] -= tmp2*rkn[0];
+            f[atomk][1] -= tmp2*rkn[1];
+            f[atomk][2] -= tmp2*rkn[2];
+            f[atomn][0] += tmp2*rkn[0];
+            f[atomn][1] += tmp2*rkn[1];
+            f[atomn][2] += tmp2*rkn[2];
+
+            if (vflag_atom) v_tally2_thr(atomk,atomn,-tmp2,rkn,thr);
+          }
+        }
+      }
+    }
+  }
+
+  // piRC forces
+
+  REBO_neighs = REBO_firstneigh[atomj];
+  for (l = 0; l < REBO_numneigh[atomj]; l++) {
+    atoml = REBO_neighs[l];
+    if (atoml !=atomi) {
+      ltype = map[type[atoml]];
+      rjl[0] = x[atomj][0]-x[atoml][0];
+      rjl[1] = x[atomj][1]-x[atoml][1];
+      rjl[2] = x[atomj][2]-x[atoml][2];
+      rjlmag = sqrt((rjl[0]*rjl[0])+(rjl[1]*rjl[1])+(rjl[2]*rjl[2]));
+      wjl = Sp(rjlmag,rcmin[jtype][ltype],rcmax[jtype][ltype],dwjl);
+      Nlj = nC[atoml]-(wjl*kronecker(jtype,0))+nH[atoml] -
+        (wjl*kronecker(jtype,1));
+      SpN = Sp(Nlj,Nmin,Nmax,dNlj);
+
+      tmp2 = VA*dN3[1]*dwjl/rjlmag;
+      f[atomj][0] -= tmp2*rjl[0];
+      f[atomj][1] -= tmp2*rjl[1];
+      f[atomj][2] -= tmp2*rjl[2];
+      f[atoml][0] += tmp2*rjl[0];
+      f[atoml][1] += tmp2*rjl[1];
+      f[atoml][2] += tmp2*rjl[2];
+
+      if (vflag_atom) v_tally2_thr(atomj,atoml,-tmp2,rjl,thr);
+
+      tmp2 = VA*dN3[2]*(2.0*NconjtmpJ*dwjl*SpN)/rjlmag;
+      f[atomj][0] -= tmp2*rjl[0];
+      f[atomj][1] -= tmp2*rjl[1];
+      f[atomj][2] -= tmp2*rjl[2];
+      f[atoml][0] += tmp2*rjl[0];
+      f[atoml][1] += tmp2*rjl[1];
+      f[atoml][2] += tmp2*rjl[2];
+
+      if (vflag_atom) v_tally2_thr(atomj,atoml,-tmp2,rjl,thr);
+
+      if (fabs(dNlj) > TOL) {
+        REBO_neighs_l = REBO_firstneigh[atoml];
+        for (n = 0; n < REBO_numneigh[atoml]; n++) {
+          atomn = REBO_neighs_l[n];
+          if (atomn != atomj) {
+            ntype = map[type[atomn]];
+            rln[0] = x[atoml][0]-x[atomn][0];
+            rln[1] = x[atoml][1]-x[atomn][1];
+            rln[2] = x[atoml][2]-x[atomn][2];
+            rlnmag = sqrt((rln[0]*rln[0])+(rln[1]*rln[1])+(rln[2]*rln[2]));
+            Sp(rlnmag,rcmin[ltype][ntype],rcmax[ltype][ntype],dwln);
+
+            tmp2 = VA*dN3[2]*(2.0*NconjtmpJ*wjl*dNlj*dwln)/rlnmag;
+            f[atoml][0] -= tmp2*rln[0];
+            f[atoml][1] -= tmp2*rln[1];
+            f[atoml][2] -= tmp2*rln[2];
+            f[atomn][0] += tmp2*rln[0];
+            f[atomn][1] += tmp2*rln[1];
+            f[atomn][2] += tmp2*rln[2];
+
+            if (vflag_atom) v_tally2_thr(atoml,atomn,-tmp2,rln,thr);
+          }
+        }
+      }
+    }
+  }
+
+  Tij = 0.0;
+  dN3[0] = 0.0;
+  dN3[1] = 0.0;
+  dN3[2] = 0.0;
+  if (itype == 0 && jtype == 0)
+    Tij=TijSpline((NijC+NijH),(NjiC+NjiH),Nijconj,dN3);
+  Etmp = 0.0;
+
+  if (fabs(Tij) > TOL) {
+    atom2 = atomi;
+    atom3 = atomj;
+    r32[0] = x[atom3][0]-x[atom2][0];
+    r32[1] = x[atom3][1]-x[atom2][1];
+    r32[2] = x[atom3][2]-x[atom2][2];
+    r32mag = sqrt((r32[0]*r32[0])+(r32[1]*r32[1])+(r32[2]*r32[2]));
+    r23[0] = -r32[0];
+    r23[1] = -r32[1];
+    r23[2] = -r32[2];
+    r23mag = r32mag;
+    REBO_neighs_i = REBO_firstneigh[i];
+    for (k = 0; k < REBO_numneigh[i]; k++) {
+      atomk = REBO_neighs_i[k];
+      atom1 = atomk;
+      ktype = map[type[atomk]];
+      if (atomk != atomj) {
+        r21[0] = x[atom2][0]-x[atom1][0];
+        r21[1] = x[atom2][1]-x[atom1][1];
+        r21[2] = x[atom2][2]-x[atom1][2];
+        r21mag = sqrt(r21[0]*r21[0] + r21[1]*r21[1] + r21[2]*r21[2]);
+        cos321 = -1.0*((r21[0]*r32[0])+(r21[1]*r32[1])+(r21[2]*r32[2])) /
+          (r21mag*r32mag);
+        cos321 = MIN(cos321,1.0);
+        cos321 = MAX(cos321,-1.0);
+        Sp2(cos321,thmin,thmax,dcut321);
+        sin321 = sqrt(1.0 - cos321*cos321);
+        sink2i = 1.0/(sin321*sin321);
+        rik2i = 1.0/(r21mag*r21mag);
+        if (sin321 != 0.0) {
+          rr = (r23mag*r23mag)-(r21mag*r21mag);
+          rjk[0] = r21[0]-r23[0];
+          rjk[1] = r21[1]-r23[1];
+          rjk[2] = r21[2]-r23[2];
+          rjk2 = (rjk[0]*rjk[0])+(rjk[1]*rjk[1])+(rjk[2]*rjk[2]);
+          rijrik = 2.0*r23mag*r21mag;
+          rik2 = r21mag*r21mag;
+          dctik = (-rr+rjk2)/(rijrik*rik2);
+          dctij = (rr+rjk2)/(rijrik*r23mag*r23mag);
+          dctjk = -2.0/rijrik;
+          w21 = Sp(r21mag,rcmin[itype][ktype],rcmaxp[itype][ktype],dw21);
+          rijmag = r32mag;
+          rikmag = r21mag;
+          rij2 = r32mag*r32mag;
+          rik2 = r21mag*r21mag;
+          costmp = 0.5*(rij2+rik2-rjk2)/rijmag/rikmag;
+          tspjik = Sp2(costmp,thmin,thmax,dtsjik);
+          dtsjik = -dtsjik;
+
+          REBO_neighs_j = REBO_firstneigh[j];
+          for (l = 0; l < REBO_numneigh[j]; l++) {
+            atoml = REBO_neighs_j[l];
+            atom4 = atoml;
+            ltype = map[type[atoml]];
+            if (!(atoml == atomi || atoml == atomk)) {
+              r34[0] = x[atom3][0]-x[atom4][0];
+              r34[1] = x[atom3][1]-x[atom4][1];
+              r34[2] = x[atom3][2]-x[atom4][2];
+              r34mag = sqrt((r34[0]*r34[0])+(r34[1]*r34[1])+(r34[2]*r34[2]));
+              cos234 = (r32[0]*r34[0] + r32[1]*r34[1] + r32[2]*r34[2]) /
+                (r32mag*r34mag);
+              cos234 = MIN(cos234,1.0);
+              cos234 = MAX(cos234,-1.0);
+              sin234 = sqrt(1.0 - cos234*cos234);
+              sinl2i = 1.0/(sin234*sin234);
+              rjl2i = 1.0/(r34mag*r34mag);
+
+              if (sin234 != 0.0) {
+                w34 = Sp(r34mag,rcmin[jtype][ltype],rcmaxp[jtype][ltype],dw34);
+                rr = (r23mag*r23mag)-(r34mag*r34mag);
+                ril[0] = r23[0]+r34[0];
+                ril[1] = r23[1]+r34[1];
+                ril[2] = r23[2]+r34[2];
+                ril2 = (ril[0]*ril[0])+(ril[1]*ril[1])+(ril[2]*ril[2]);
+                rijrjl = 2.0*r23mag*r34mag;
+                rjl2 = r34mag*r34mag;
+                dctjl = (-rr+ril2)/(rijrjl*rjl2);
+                dctji = (rr+ril2)/(rijrjl*r23mag*r23mag);
+                dctil = -2.0/rijrjl;
+                rjlmag = r34mag;
+                rjl2 = r34mag*r34mag;
+                costmp = 0.5*(rij2+rjl2-ril2)/rijmag/rjlmag;
+                tspijl = Sp2(costmp,thmin,thmax,dtsijl);
+                dtsijl = -dtsijl;
+                prefactor = VA*Tij;
+
+                cross321[0] = (r32[1]*r21[2])-(r32[2]*r21[1]);
+                cross321[1] = (r32[2]*r21[0])-(r32[0]*r21[2]);
+                cross321[2] = (r32[0]*r21[1])-(r32[1]*r21[0]);
+                cross234[0] = (r23[1]*r34[2])-(r23[2]*r34[1]);
+                cross234[1] = (r23[2]*r34[0])-(r23[0]*r34[2]);
+                cross234[2] = (r23[0]*r34[1])-(r23[1]*r34[0]);
+
+                cwnum = (cross321[0]*cross234[0]) +
+                  (cross321[1]*cross234[1]) + (cross321[2]*cross234[2]);
+                cwnom = r21mag*r34mag*r23mag*r23mag*sin321*sin234;
+                om1234 = cwnum/cwnom;
+                cw = om1234;
+                Etmp += ((1.0-(om1234*om1234))*w21*w34) *
+                  (1.0-tspjik)*(1.0-tspijl);
+
+                dt1dik = (rik2i)-(dctik*sink2i*cos321);
+                dt1djk = (-dctjk*sink2i*cos321);
+                dt1djl = (rjl2i)-(dctjl*sinl2i*cos234);
+                dt1dil = (-dctil*sinl2i*cos234);
+                dt1dij = (2.0/(r23mag*r23mag))-(dctij*sink2i*cos321) -
+                  (dctji*sinl2i*cos234);
+
+                dt2dik[0] = (-r23[2]*cross234[1])+(r23[1]*cross234[2]);
+                dt2dik[1] = (-r23[0]*cross234[2])+(r23[2]*cross234[0]);
+                dt2dik[2] = (-r23[1]*cross234[0])+(r23[0]*cross234[1]);
+
+                dt2djl[0] = (-r23[1]*cross321[2])+(r23[2]*cross321[1]);
+                dt2djl[1] = (-r23[2]*cross321[0])+(r23[0]*cross321[2]);
+                dt2djl[2] = (-r23[0]*cross321[1])+(r23[1]*cross321[0]);
+
+                dt2dij[0] = (r21[2]*cross234[1])-(r34[2]*cross321[1]) -
+                  (r21[1]*cross234[2])+(r34[1]*cross321[2]);
+                dt2dij[1] = (r21[0]*cross234[2])-(r34[0]*cross321[2]) -
+                  (r21[2]*cross234[0])+(r34[2]*cross321[0]);
+                dt2dij[2] = (r21[1]*cross234[0])-(r34[1]*cross321[0]) -
+                  (r21[0]*cross234[1])+(r34[0]*cross321[1]);
+
+                aa = (prefactor*2.0*cw/cwnom)*w21*w34 *
+                  (1.0-tspjik)*(1.0-tspijl);
+                aaa1 = -prefactor*(1.0-(om1234*om1234)) *
+                  (1.0-tspjik)*(1.0-tspijl);
+                aaa2 = aaa1*w21*w34;
+                at2 = aa*cwnum;
+
+                fcijpc = (-dt1dij*at2)+(aaa2*dtsjik*dctij*(1.0-tspijl)) +
+                  (aaa2*dtsijl*dctji*(1.0-tspjik));
+                fcikpc = (-dt1dik*at2)+(aaa2*dtsjik*dctik*(1.0-tspijl));
+                fcjlpc = (-dt1djl*at2)+(aaa2*dtsijl*dctjl*(1.0-tspjik));
+                fcjkpc = (-dt1djk*at2)+(aaa2*dtsjik*dctjk*(1.0-tspijl));
+                fcilpc = (-dt1dil*at2)+(aaa2*dtsijl*dctil*(1.0-tspjik));
+
+                F23[0] = (fcijpc*r23[0])+(aa*dt2dij[0]);
+                F23[1] = (fcijpc*r23[1])+(aa*dt2dij[1]);
+                F23[2] = (fcijpc*r23[2])+(aa*dt2dij[2]);
+
+                F12[0] = (fcikpc*r21[0])+(aa*dt2dik[0]);
+                F12[1] = (fcikpc*r21[1])+(aa*dt2dik[1]);
+                F12[2] = (fcikpc*r21[2])+(aa*dt2dik[2]);
+
+                F34[0] = (fcjlpc*r34[0])+(aa*dt2djl[0]);
+                F34[1] = (fcjlpc*r34[1])+(aa*dt2djl[1]);
+                F34[2] = (fcjlpc*r34[2])+(aa*dt2djl[2]);
+
+                F31[0] = (fcjkpc*rjk[0]);
+                F31[1] = (fcjkpc*rjk[1]);
+                F31[2] = (fcjkpc*rjk[2]);
+
+                F24[0] = (fcilpc*ril[0]);
+                F24[1] = (fcilpc*ril[1]);
+                F24[2] = (fcilpc*ril[2]);
+
+                f1[0] = -F12[0]-F31[0];
+                f1[1] = -F12[1]-F31[1];
+                f1[2] = -F12[2]-F31[2];
+                f2[0] = F23[0]+F12[0]+F24[0];
+                f2[1] = F23[1]+F12[1]+F24[1];
+                f2[2] = F23[2]+F12[2]+F24[2];
+                f3[0] = -F23[0]+F34[0]+F31[0];
+                f3[1] = -F23[1]+F34[1]+F31[1];
+                f3[2] = -F23[2]+F34[2]+F31[2];
+                f4[0] = -F34[0]-F24[0];
+                f4[1] = -F34[1]-F24[1];
+                f4[2] = -F34[2]-F24[2];
+
+                // coordination forces
+
+                tmp2 = VA*Tij*((1.0-(om1234*om1234))) *
+                  (1.0-tspjik)*(1.0-tspijl)*dw21*w34/r21mag;
+                f2[0] -= tmp2*r21[0];
+                f2[1] -= tmp2*r21[1];
+                f2[2] -= tmp2*r21[2];
+                f1[0] += tmp2*r21[0];
+                f1[1] += tmp2*r21[1];
+                f1[2] += tmp2*r21[2];
+
+                tmp2 = VA*Tij*((1.0-(om1234*om1234))) *
+                  (1.0-tspjik)*(1.0-tspijl)*w21*dw34/r34mag;
+                f3[0] -= tmp2*r34[0];
+                f3[1] -= tmp2*r34[1];
+                f3[2] -= tmp2*r34[2];
+                f4[0] += tmp2*r34[0];
+                f4[1] += tmp2*r34[1];
+                f4[2] += tmp2*r34[2];
+
+                f[atom1][0] += f1[0]; f[atom1][1] += f1[1];
+                f[atom1][2] += f1[2];
+                f[atom2][0] += f2[0]; f[atom2][1] += f2[1];
+                f[atom2][2] += f2[2];
+                f[atom3][0] += f3[0]; f[atom3][1] += f3[1];
+                f[atom3][2] += f3[2];
+                f[atom4][0] += f4[0]; f[atom4][1] += f4[1];
+                f[atom4][2] += f4[2];
+
+                if (vflag_atom) {
+                  r13[0] = -rjk[0]; r13[1] = -rjk[1]; r13[2] = -rjk[2];
+                  r43[0] = -r34[0]; r43[1] = -r34[1]; r43[2] = -r34[2];
+                  v_tally4_thr(atom1,atom2,atom3,atom4,f1,f2,f4,r13,r23,r43,thr);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // Tij forces now that we have Etmp
+
+    REBO_neighs = REBO_firstneigh[i];
+    for (k = 0; k < REBO_numneigh[i]; k++) {
+      atomk = REBO_neighs[k];
+      if (atomk != atomj) {
+        ktype = map[type[atomk]];
+        rik[0] = x[atomi][0]-x[atomk][0];
+        rik[1] = x[atomi][1]-x[atomk][1];
+        rik[2] = x[atomi][2]-x[atomk][2];
+        rikmag = sqrt((rik[0]*rik[0])+(rik[1]*rik[1])+(rik[2]*rik[2]));
+        wik = Sp(rikmag,rcmin[itype][ktype],rcmax[itype][ktype],dwik);
+        Nki = nC[atomk]-(wik*kronecker(itype,0))+nH[atomk] -
+          (wik*kronecker(itype,1));
+        SpN = Sp(Nki,Nmin,Nmax,dNki);
+
+        tmp2 = VA*dN3[0]*dwik*Etmp/rikmag;
+        f[atomi][0] -= tmp2*rik[0];
+        f[atomi][1] -= tmp2*rik[1];
+        f[atomi][2] -= tmp2*rik[2];
+        f[atomk][0] += tmp2*rik[0];
+        f[atomk][1] += tmp2*rik[1];
+        f[atomk][2] += tmp2*rik[2];
+
+        if (vflag_atom) v_tally2_thr(atomi,atomk,-tmp2,rik,thr);
+
+        tmp2 = VA*dN3[2]*(2.0*NconjtmpI*dwik*SpN)*Etmp/rikmag;
+        f[atomi][0] -= tmp2*rik[0];
+        f[atomi][1] -= tmp2*rik[1];
+        f[atomi][2] -= tmp2*rik[2];
+        f[atomk][0] += tmp2*rik[0];
+        f[atomk][1] += tmp2*rik[1];
+        f[atomk][2] += tmp2*rik[2];
+
+        if (vflag_atom) v_tally2_thr(atomi,atomk,-tmp2,rik,thr);
+
+        if (fabs(dNki) > TOL) {
+          REBO_neighs_k = REBO_firstneigh[atomk];
+          for (n = 0; n < REBO_numneigh[atomk]; n++) {
+            atomn = REBO_neighs_k[n];
+            ntype = map[type[atomn]];
+            if (atomn != atomi) {
+              rkn[0] = x[atomk][0]-x[atomn][0];
+              rkn[1] = x[atomk][1]-x[atomn][1];
+              rkn[2] = x[atomk][2]-x[atomn][2];
+              rknmag = sqrt((rkn[0]*rkn[0])+(rkn[1]*rkn[1])+(rkn[2]*rkn[2]));
+              Sp(rknmag,rcmin[ktype][ntype],rcmax[ktype][ntype],dwkn);
+
+              tmp2 = VA*dN3[2]*(2.0*NconjtmpI*wik*dNki*dwkn)*Etmp/rknmag;
+              f[atomk][0] -= tmp2*rkn[0];
+              f[atomk][1] -= tmp2*rkn[1];
+              f[atomk][2] -= tmp2*rkn[2];
+              f[atomn][0] += tmp2*rkn[0];
+              f[atomn][1] += tmp2*rkn[1];
+              f[atomn][2] += tmp2*rkn[2];
+
+              if (vflag_atom) v_tally2_thr(atomk,atomn,-tmp2,rkn,thr);
+            }
+          }
+        }
+      }
+    }
+
+    // Tij forces
+
+    REBO_neighs = REBO_firstneigh[j];
+    for (l = 0; l < REBO_numneigh[j]; l++) {
+      atoml = REBO_neighs[l];
+      if (atoml != atomi) {
+        ltype = map[type[atoml]];
+        rjl[0] = x[atomj][0]-x[atoml][0];
+        rjl[1] = x[atomj][1]-x[atoml][1];
+        rjl[2] = x[atomj][2]-x[atoml][2];
+        rjlmag = sqrt((rjl[0]*rjl[0])+(rjl[1]*rjl[1])+(rjl[2]*rjl[2]));
+        wjl = Sp(rjlmag,rcmin[jtype][ltype],rcmax[jtype][ltype],dwjl);
+        Nlj = nC[atoml]-(wjl*kronecker(jtype,0))+nH[atoml] -
+          (wjl*kronecker(jtype,1));
+        SpN = Sp(Nlj,Nmin,Nmax,dNlj);
+
+        tmp2 = VA*dN3[1]*dwjl*Etmp/rjlmag;
+        f[atomj][0] -= tmp2*rjl[0];
+        f[atomj][1] -= tmp2*rjl[1];
+        f[atomj][2] -= tmp2*rjl[2];
+        f[atoml][0] += tmp2*rjl[0];
+        f[atoml][1] += tmp2*rjl[1];
+        f[atoml][2] += tmp2*rjl[2];
+
+        if (vflag_atom) v_tally2_thr(atomj,atoml,-tmp2,rjl,thr);
+
+        tmp2 = VA*dN3[2]*(2.0*NconjtmpJ*dwjl*SpN)*Etmp/rjlmag;
+        f[atomj][0] -= tmp2*rjl[0];
+        f[atomj][1] -= tmp2*rjl[1];
+        f[atomj][2] -= tmp2*rjl[2];
+        f[atoml][0] += tmp2*rjl[0];
+        f[atoml][1] += tmp2*rjl[1];
+        f[atoml][2] += tmp2*rjl[2];
+
+        if (vflag_atom) v_tally2_thr(atomj,atoml,-tmp2,rjl,thr);
+
+        if (fabs(dNlj) > TOL) {
+          REBO_neighs_l = REBO_firstneigh[atoml];
+          for (n = 0; n < REBO_numneigh[atoml]; n++) {
+            atomn = REBO_neighs_l[n];
+            ntype = map[type[atomn]];
+            if (atomn !=atomj) {
+              rln[0] = x[atoml][0]-x[atomn][0];
+              rln[1] = x[atoml][1]-x[atomn][1];
+              rln[2] = x[atoml][2]-x[atomn][2];
+              rlnmag = sqrt((rln[0]*rln[0])+(rln[1]*rln[1])+(rln[2]*rln[2]));
+              Sp(rlnmag,rcmin[ltype][ntype],rcmax[ltype][ntype],dwln);
+
+              tmp2 = VA*dN3[2]*(2.0*NconjtmpJ*wjl*dNlj*dwln)*Etmp/rlnmag;
+              f[atoml][0] -= tmp2*rln[0];
+              f[atoml][1] -= tmp2*rln[1];
+              f[atoml][2] -= tmp2*rln[2];
+              f[atomn][0] += tmp2*rln[0];
+              f[atomn][1] += tmp2*rln[1];
+              f[atomn][2] += tmp2*rln[2];
+
+              if (vflag_atom) v_tally2_thr(atoml,atomn,-tmp2,rln,thr);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  bij = (0.5*(pij+pji))+piRC+(Tij*Etmp);
+  return bij;
+}
+
+/* ----------------------------------------------------------------------
+   Bij* function
+------------------------------------------------------------------------- */
+
+double PairAIREBOOMP::bondorderLJ_thr(int i, int j, double rij[3], double rijmag,
+                                      double VA, double rij0[3], double rij0mag,
+                                      int vflag_atom, ThrData * const thr)
+{
+  int k,n,l,atomk,atoml,atomn,atom1,atom2,atom3,atom4;
+  int atomi,atomj,itype,jtype,ktype,ltype,ntype;
+  double rik[3], rjl[3], rkn[3],rknmag,dNki;
+  double NijC,NijH,NjiC,NjiH,wik,dwik,dwkn,wjl;
+  double rikmag,rjlmag,cosjik,cosijl,g,tmp2,tmp3;
+  double Etmp,pij,tmp,wij,dwij,NconjtmpI,NconjtmpJ;
+  double Nki,Nlj,dS,lamdajik,lamdaijl,dgdc,dgdN,pji,Nijconj,piRC;
+  double dcosjikdri[3],dcosijldri[3],dcosjikdrk[3];
+  double dN2[2],dN3[3];
+  double dcosijldrj[3],dcosijldrl[3],dcosjikdrj[3],dwjl;
+  double Tij,crosskij[3],crosskijmag;
+  double crossijl[3],crossijlmag,omkijl;
+  double tmppij,tmppji,dN2PIJ[2],dN2PJI[2],dN3piRC[3],dN3Tij[3];
+  double bij,tmp3pij,tmp3pji,Stb,dStb;
+  double r32[3],r32mag,cos321;
+  double om1234,rln[3];
+  double rlnmag,dwln,r23[3],r23mag,r21[3],r21mag;
+  double w21,dw21,r34[3],r34mag,cos234,w34,dw34;
+  double cross321[3],cross234[3],prefactor,SpN;
+  double fcijpc,fcikpc,fcjlpc,fcjkpc,fcilpc;
+  double dt2dik[3],dt2djl[3],dt2dij[3],aa,aaa1,aaa2,at2,cw,cwnum,cwnom;
+  double sin321,sin234,rr,rijrik,rijrjl,rjk2,rik2,ril2,rjl2;
+  double dctik,dctjk,dctjl,dctij,dctji,dctil,rik2i,rjl2i,sink2i,sinl2i;
+  double rjk[3],ril[3],dt1dik,dt1djk,dt1djl,dt1dil,dt1dij;
+  double dNlj;
+  double PijS,PjiS;
+  double rij2,tspjik,dtsjik,tspijl,dtsijl,costmp;
+  int *REBO_neighs,*REBO_neighs_i,*REBO_neighs_j,*REBO_neighs_k,*REBO_neighs_l;
+  double F12[3],F23[3],F34[3],F31[3],F24[3];
+  double fi[3],fj[3],fk[3],fl[3],f1[3],f2[3],f3[3],f4[4];
+  double rji[3],rki[3],rlj[3],r13[3],r43[3];
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+
+  atomi = i;
+  atomj = j;
+  itype = map[type[atomi]];
+  jtype = map[type[atomj]];
+  wij = Sp(rij0mag,rcmin[itype][jtype],rcmax[itype][jtype],dwij);
+  NijC = nC[atomi]-(wij*kronecker(jtype,0));
+  NijH = nH[atomi]-(wij*kronecker(jtype,1));
+  NjiC = nC[atomj]-(wij*kronecker(itype,0));
+  NjiH = nH[atomj]-(wij*kronecker(itype,1));
+
+  bij = 0.0;
+  tmp = 0.0;
+  tmp2 = 0.0;
+  tmp3 = 0.0;
+  dgdc = 0.0;
+  dgdN = 0.0;
+  NconjtmpI = 0.0;
+  NconjtmpJ = 0.0;
+  Etmp = 0.0;
+
+  REBO_neighs = REBO_firstneigh[i];
+  for (k = 0; k < REBO_numneigh[i]; k++) {
+    atomk = REBO_neighs[k];
+    if (atomk != atomj) {
+      ktype = map[type[atomk]];
+      rik[0] = x[atomi][0]-x[atomk][0];
+      rik[1] = x[atomi][1]-x[atomk][1];
+      rik[2] = x[atomi][2]-x[atomk][2];
+      rikmag = sqrt((rik[0]*rik[0])+(rik[1]*rik[1])+(rik[2]*rik[2]));
+      lamdajik = 4.0*kronecker(itype,1) *
+        ((rho[ktype][1]-rikmag)-(rho[jtype][1]-rijmag));
+      wik = Sp(rikmag,rcmin[itype][ktype],rcmax[itype][ktype],dS);
+      Nki = nC[atomk]-(wik*kronecker(itype,0)) +
+        nH[atomk]-(wik*kronecker(itype,1));
+      cosjik = ((rij[0]*rik[0])+(rij[1]*rik[1])+(rij[2]*rik[2])) /
+        (rijmag*rikmag);
+      cosjik = MIN(cosjik,1.0);
+      cosjik = MAX(cosjik,-1.0);
+
+      // evaluate splines g and derivatives dg
+
+      g = gSpline(cosjik,(NijC+NijH),itype,&dgdc,&dgdN);
+      Etmp += (wik*g*exp(lamdajik));
+      tmp3 += (wik*dgdN*exp(lamdajik));
+      NconjtmpI = NconjtmpI+(kronecker(ktype,0)*wik*Sp(Nki,Nmin,Nmax,dS));
+    }
+  }
+
+  PijS = 0.0;
+  dN2PIJ[0] = 0.0;
+  dN2PIJ[1] = 0.0;
+  PijS = PijSpline(NijC,NijH,itype,jtype,dN2PIJ);
+  pij = 1.0/sqrt(1.0+Etmp+PijS);
+  tmppij = -.5*pij*pij*pij;
+  tmp3pij = tmp3;
+  tmp = 0.0;
+  tmp2 = 0.0;
+  tmp3 = 0.0;
+  Etmp = 0.0;
+
+  REBO_neighs = REBO_firstneigh[j];
+  for (l = 0; l < REBO_numneigh[j]; l++) {
+    atoml = REBO_neighs[l];
+    if (atoml != atomi) {
+      ltype = map[type[atoml]];
+      rjl[0] = x[atomj][0]-x[atoml][0];
+      rjl[1] = x[atomj][1]-x[atoml][1];
+      rjl[2] = x[atomj][2]-x[atoml][2];
+      rjlmag = sqrt((rjl[0]*rjl[0])+(rjl[1]*rjl[1])+(rjl[2]*rjl[2]));
+      lamdaijl = 4.0*kronecker(jtype,1) *
+        ((rho[ltype][1]-rjlmag)-(rho[itype][1]-rijmag));
+      wjl = Sp(rjlmag,rcmin[jtype][ltype],rcmax[jtype][ltype],dS);
+      Nlj = nC[atoml]-(wjl*kronecker(jtype,0))+nH[atoml] -
+        (wjl*kronecker(jtype,1));
+      cosijl = -1.0*((rij[0]*rjl[0])+(rij[1]*rjl[1])+(rij[2]*rjl[2])) /
+        (rijmag*rjlmag);
+      cosijl = MIN(cosijl,1.0);
+      cosijl = MAX(cosijl,-1.0);
+
+      // evaluate splines g and derivatives dg
+
+      g = gSpline(cosijl,NjiC+NjiH,jtype,&dgdc,&dgdN);
+      Etmp = Etmp+(wjl*g*exp(lamdaijl));
+      tmp3 = tmp3+(wjl*dgdN*exp(lamdaijl));
+      NconjtmpJ = NconjtmpJ+(kronecker(ltype,0)*wjl*Sp(Nlj,Nmin,Nmax,dS));
+    }
+  }
+
+  PjiS = 0.0;
+  dN2PJI[0] = 0.0;
+  dN2PJI[1] = 0.0;
+  PjiS = PijSpline(NjiC,NjiH,jtype,itype,dN2PJI);
+  pji = 1.0/sqrt(1.0+Etmp+PjiS);
+  tmppji = -.5*pji*pji*pji;
+  tmp3pji = tmp3;
+
+  // evaluate Nij conj
+
+  Nijconj = 1.0+(NconjtmpI*NconjtmpI)+(NconjtmpJ*NconjtmpJ);
+  piRC = piRCSpline(NijC+NijH,NjiC+NjiH,Nijconj,itype,jtype,dN3piRC);
+  Tij = 0.0;
+  dN3Tij[0] = 0.0;
+  dN3Tij[1] = 0.0;
+  dN3Tij[2] = 0.0;
+  if (itype == 0 && jtype == 0)
+    Tij=TijSpline((NijC+NijH),(NjiC+NjiH),Nijconj,dN3Tij);
+
+  Etmp = 0.0;
+  if (fabs(Tij) > TOL) {
+    REBO_neighs_i = REBO_firstneigh[i];
+    for (k = 0; k < REBO_numneigh[i]; k++) {
+      atomk = REBO_neighs_i[k];
+      ktype = map[type[atomk]];
+      if (atomk != atomj) {
+        rik[0] = x[atomi][0]-x[atomk][0];
+        rik[1] = x[atomi][1]-x[atomk][1];
+        rik[2] = x[atomi][2]-x[atomk][2];
+        rikmag = sqrt((rik[0]*rik[0])+(rik[1]*rik[1])+(rik[2]*rik[2]));
+        cos321 = ((rij[0]*rik[0])+(rij[1]*rik[1])+(rij[2]*rik[2])) /
+          (rijmag*rikmag);
+        cos321 = MIN(cos321,1.0);
+        cos321 = MAX(cos321,-1.0);
+
+        rjk[0] = rik[0]-rij[0];
+        rjk[1] = rik[1]-rij[1];
+        rjk[2] = rik[2]-rij[2];
+        rjk2 = (rjk[0]*rjk[0])+(rjk[1]*rjk[1])+(rjk[2]*rjk[2]);
+        rij2 = rijmag*rijmag;
+        rik2 = rikmag*rikmag;
+        costmp = 0.5*(rij2+rik2-rjk2)/rijmag/rikmag;
+        tspjik = Sp2(costmp,thmin,thmax,dtsjik);
+
+        if (sqrt(1.0 - cos321*cos321) > sqrt(TOL)) {
+          wik = Sp(rikmag,rcmin[itype][ktype],rcmaxp[itype][ktype],dwik);
+          REBO_neighs_j = REBO_firstneigh[j];
+          for (l = 0; l < REBO_numneigh[j]; l++) {
+            atoml = REBO_neighs_j[l];
+            ltype = map[type[atoml]];
+            if (!(atoml == atomi || atoml == atomk)) {
+              rjl[0] = x[atomj][0]-x[atoml][0];
+              rjl[1] = x[atomj][1]-x[atoml][1];
+              rjl[2] = x[atomj][2]-x[atoml][2];
+              rjlmag = sqrt(rjl[0]*rjl[0] + rjl[1]*rjl[1] + rjl[2]*rjl[2]);
+              cos234 = -((rij[0]*rjl[0])+(rij[1]*rjl[1])+(rij[2]*rjl[2])) /
+                (rijmag*rjlmag);
+              cos234 = MIN(cos234,1.0);
+              cos234 = MAX(cos234,-1.0);
+
+              ril[0] = rij[0]+rjl[0];
+              ril[1] = rij[1]+rjl[1];
+              ril[2] = rij[2]+rjl[2];
+              ril2 = (ril[0]*ril[0])+(ril[1]*ril[1])+(ril[2]*ril[2]);
+              rijrjl = 2.0*rijmag*rjlmag;
+              rjl2 = rjlmag*rjlmag;
+              costmp = 0.5*(rij2+rjl2-ril2)/rijmag/rjlmag;
+              tspijl = Sp2(costmp,thmin,thmax,dtsijl);
+
+              if (sqrt(1.0 - cos234*cos234) > sqrt(TOL)) {
+                wjl = Sp(rjlmag,rcmin[jtype][ltype],rcmaxp[jtype][ltype],dS);
+                crosskij[0] = (rij[1]*rik[2]-rij[2]*rik[1]);
+                crosskij[1] = (rij[2]*rik[0]-rij[0]*rik[2]);
+                crosskij[2] = (rij[0]*rik[1]-rij[1]*rik[0]);
+                crosskijmag = sqrt(crosskij[0]*crosskij[0] +
+                                   crosskij[1]*crosskij[1] +
+                                   crosskij[2]*crosskij[2]);
+                crossijl[0] = (rij[1]*rjl[2]-rij[2]*rjl[1]);
+                crossijl[1] = (rij[2]*rjl[0]-rij[0]*rjl[2]);
+                crossijl[2] = (rij[0]*rjl[1]-rij[1]*rjl[0]);
+                crossijlmag = sqrt(crossijl[0]*crossijl[0] +
+                                   crossijl[1]*crossijl[1] +
+                                   crossijl[2]*crossijl[2]);
+                omkijl = -1.0*(((crosskij[0]*crossijl[0]) +
+                                (crosskij[1]*crossijl[1]) +
+                                (crosskij[2]*crossijl[2])) /
+                               (crosskijmag*crossijlmag));
+                Etmp += ((1.0-(omkijl*omkijl))*wik*wjl) *
+                  (1.0-tspjik)*(1.0-tspijl);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  bij = (.5*(pij+pji))+piRC+(Tij*Etmp);
+  Stb = Sp2(bij,bLJmin[itype][jtype],bLJmax[itype][jtype],dStb);
+  VA = VA*dStb;
+
+  if (dStb != 0.0) {
+    tmp = tmppij;
+    dN2[0] = dN2PIJ[0];
+    dN2[1] = dN2PIJ[1];
+    tmp3 = tmp3pij;
+
+    const double invrijm = 1.0/rijmag;
+    const double invrijm2 = invrijm*invrijm;
+
+    // pij forces
+
+    REBO_neighs_i = REBO_firstneigh[i];
+    for (k = 0; k < REBO_numneigh[i]; k++) {
+      atomk = REBO_neighs_i[k];
+      if (atomk != atomj) {
+        lamdajik = 0.0;
+        rik[0] = x[atomi][0]-x[atomk][0];
+        rik[1] = x[atomi][1]-x[atomk][1];
+        rik[2] = x[atomi][2]-x[atomk][2];
+        rikmag = sqrt(rik[0]*rik[0] + rik[1]*rik[1] + rik[2]*rik[2]);
+        lamdajik = 4.0*kronecker(itype,1) *
+          ((rho[ktype][1]-rikmag)-(rho[jtype][1]-rijmag));
+        wik = Sp(rikmag,rcmin[itype][ktype],rcmax[itype][ktype],dwik);
+
+        const double invrikm = 1.0/rikmag;
+        const double invrijkm = invrijm*invrikm;
+        const double invrikm2 = invrikm*invrikm;
+
+        cosjik = ((rij[0]*rik[0])+(rij[1]*rik[1])+(rij[2]*rik[2]))
+          * invrijkm;
+        cosjik = MIN(cosjik,1.0);
+        cosjik = MAX(cosjik,-1.0);
+
+        dcosjikdri[0] = ((rij[0]+rik[0])*invrijkm) -
+          (cosjik*((rij[0]*invrijm2)+(rik[0]*invrikm2)));
+        dcosjikdri[1] = ((rij[1]+rik[1])*invrijkm) -
+          (cosjik*((rij[1]*invrijm2)+(rik[1]*invrikm2)));
+        dcosjikdri[2] = ((rij[2]+rik[2])*invrijkm) -
+          (cosjik*((rij[2]*invrijm2)+(rik[2]*invrikm2)));
+        dcosjikdrk[0] = (-rij[0]*invrijkm) + (cosjik*(rik[0]*invrikm2));
+        dcosjikdrk[1] = (-rij[1]*invrijkm) + (cosjik*(rik[1]*invrikm2));
+        dcosjikdrk[2] = (-rij[2]*invrijkm) + (cosjik*(rik[2]*invrikm2));
+        dcosjikdrj[0] = (-rik[0]*invrijkm) + (cosjik*(rij[0]*invrijm2));
+        dcosjikdrj[1] = (-rik[1]*invrijkm) + (cosjik*(rij[1]*invrijm2));
+        dcosjikdrj[2] = (-rik[2]*invrijkm) + (cosjik*(rij[2]*invrijm2));
+
+        g = gSpline(cosjik,(NijC+NijH),itype,&dgdc,&dgdN);
+
+        tmp2 = VA*.5*(tmp*wik*dgdc*exp(lamdajik));
+        fj[0] = -tmp2*dcosjikdrj[0];
+        fj[1] = -tmp2*dcosjikdrj[1];
+        fj[2] = -tmp2*dcosjikdrj[2];
+        fi[0] = -tmp2*dcosjikdri[0];
+        fi[1] = -tmp2*dcosjikdri[1];
+        fi[2] = -tmp2*dcosjikdri[2];
+        fk[0] = -tmp2*dcosjikdrk[0];
+        fk[1] = -tmp2*dcosjikdrk[1];
+        fk[2] = -tmp2*dcosjikdrk[2];
+
+        tmp2 = VA*.5*(tmp*wik*g*exp(lamdajik)*4.0*kronecker(itype,1));
+        fj[0] -= tmp2*(-rij[0]*invrijm);
+        fj[1] -= tmp2*(-rij[1]*invrijm);
+        fj[2] -= tmp2*(-rij[2]*invrijm);
+        fi[0] -= tmp2*((-rik[0]/rikmag)+(rij[0]*invrijm));
+        fi[1] -= tmp2*((-rik[1]/rikmag)+(rij[1]*invrijm));
+        fi[2] -= tmp2*((-rik[2]/rikmag)+(rij[2]*invrijm));
+        fk[0] -= tmp2*(rik[0]/rikmag);
+        fk[1] -= tmp2*(rik[1]/rikmag);
+        fk[2] -= tmp2*(rik[2]/rikmag);
+
+        // coordination forces
+
+        // dwik forces
+
+        tmp2 = VA*.5*(tmp*dwik*g*exp(lamdajik))/rikmag;
+        fi[0] -= tmp2*rik[0];
+        fi[1] -= tmp2*rik[1];
+        fi[2] -= tmp2*rik[2];
+        fk[0] += tmp2*rik[0];
+        fk[1] += tmp2*rik[1];
+        fk[2] += tmp2*rik[2];
+
+        // PIJ forces
+
+        tmp2 = VA*.5*(tmp*dN2[ktype]*dwik)/rikmag;
+        fi[0] -= tmp2*rik[0];
+        fi[1] -= tmp2*rik[1];
+        fi[2] -= tmp2*rik[2];
+        fk[0] += tmp2*rik[0];
+        fk[1] += tmp2*rik[1];
+        fk[2] += tmp2*rik[2];
+
+        // dgdN forces
+
+        tmp2 = VA*.5*(tmp*tmp3*dwik)/rikmag;
+        fi[0] -= tmp2*rik[0];
+        fi[1] -= tmp2*rik[1];
+        fi[2] -= tmp2*rik[2];
+        fk[0] += tmp2*rik[0];
+        fk[1] += tmp2*rik[1];
+        fk[2] += tmp2*rik[2];
+
+        f[atomi][0] += fi[0]; f[atomi][1] += fi[1]; f[atomi][2] += fi[2];
+        f[atomj][0] += fj[0]; f[atomj][1] += fj[1]; f[atomj][2] += fj[2];
+        f[atomk][0] += fk[0]; f[atomk][1] += fk[1]; f[atomk][2] += fk[2];
+
+        if (vflag_atom) {
+          rji[0] = -rij[0]; rji[1] = -rij[1]; rji[2] = -rij[2];
+          rki[0] = -rik[0]; rki[1] = -rik[1]; rki[2] = -rik[2];
+          v_tally3_thr(atomi,atomj,atomk,fj,fk,rji,rki,thr);
+        }
+      }
+    }
+
+    tmp = tmppji;
+    tmp3 = tmp3pji;
+    dN2[0] = dN2PJI[0];
+    dN2[1] = dN2PJI[1];
+    REBO_neighs  =  REBO_firstneigh[j];
+    for (l = 0; l < REBO_numneigh[j]; l++) {
+      atoml = REBO_neighs[l];
+      if (atoml !=atomi) {
+        ltype = map[type[atoml]];
+        rjl[0] = x[atomj][0]-x[atoml][0];
+        rjl[1] = x[atomj][1]-x[atoml][1];
+        rjl[2] = x[atomj][2]-x[atoml][2];
+        rjlmag = sqrt((rjl[0]*rjl[0])+(rjl[1]*rjl[1])+(rjl[2]*rjl[2]));
+        lamdaijl = 4.0*kronecker(jtype,1) *
+          ((rho[ltype][1]-rjlmag)-(rho[itype][1]-rijmag));
+        wjl = Sp(rjlmag,rcmin[jtype][ltype],rcmax[jtype][ltype],dwjl);
+
+        const double invrjlm = 1.0/rjlmag;
+        const double invrijlm = invrijm*invrjlm;
+        const double invrjlm2 = invrjlm*invrjlm;
+
+        cosijl = (-1.0*((rij[0]*rjl[0])+(rij[1]*rjl[1])+(rij[2]*rjl[2]))) *
+          invrijlm;
+        cosijl = MIN(cosijl,1.0);
+        cosijl = MAX(cosijl,-1.0);
+
+        dcosijldri[0] = (-rjl[0]*invrijlm) - (cosijl*rij[0]*invrijm2);
+        dcosijldri[1] = (-rjl[1]*invrijlm) - (cosijl*rij[1]*invrijm2);
+        dcosijldri[2] = (-rjl[2]*invrijlm) - (cosijl*rij[2]*invrijm2);
+        dcosijldrj[0] = ((-rij[0]+rjl[0])*invrijlm) +
+          (cosijl*((rij[0]*invrijm2)-(rjl[0]*invrjlm2)));
+        dcosijldrj[1] = ((-rij[1]+rjl[1])*invrijlm) +
+          (cosijl*((rij[1]*invrijm2)-(rjl[1]*invrjlm2)));
+        dcosijldrj[2] = ((-rij[2]+rjl[2])*invrijlm) +
+          (cosijl*((rij[2]*invrijm2)-(rjl[2]*invrjlm2)));
+        dcosijldrl[0] = (rij[0]*invrijlm) + (cosijl*rjl[0]*invrjlm2);
+        dcosijldrl[1] = (rij[1]*invrijlm) + (cosijl*rjl[1]*invrjlm2);
+        dcosijldrl[2] = (rij[2]*invrijlm) + (cosijl*rjl[2]*invrjlm2);
+
+        // evaluate splines g and derivatives dg
+
+        g = gSpline(cosijl,NjiC+NjiH,jtype,&dgdc,&dgdN);
+        tmp2 = VA*.5*(tmp*wjl*dgdc*exp(lamdaijl));
+        fi[0] = -tmp2*dcosijldri[0];
+        fi[1] = -tmp2*dcosijldri[1];
+        fi[2] = -tmp2*dcosijldri[2];
+        fj[0] = -tmp2*dcosijldrj[0];
+        fj[1] = -tmp2*dcosijldrj[1];
+        fj[2] = -tmp2*dcosijldrj[2];
+        fl[0] = -tmp2*dcosijldrl[0];
+        fl[1] = -tmp2*dcosijldrl[1];
+        fl[2] = -tmp2*dcosijldrl[2];
+
+        tmp2 = VA*.5*(tmp*wjl*g*exp(lamdaijl)*4.0*kronecker(jtype,1));
+        fi[0] -= tmp2*(rij[0]*invrijm);
+        fi[1] -= tmp2*(rij[1]*invrijm);
+        fi[2] -= tmp2*(rij[2]*invrijm);
+        fj[0] -= tmp2*((-rjl[0]*invrjlm)-(rij[0]*invrijm));
+        fj[1] -= tmp2*((-rjl[1]*invrjlm)-(rij[1]*invrijm));
+        fj[2] -= tmp2*((-rjl[2]*invrjlm)-(rij[2]*invrijm));
+        fl[0] -= tmp2*(rjl[0]*invrjlm);
+        fl[1] -= tmp2*(rjl[1]*invrjlm);
+        fl[2] -= tmp2*(rjl[2]*invrjlm);
+
+         // coordination forces
+        // dwik forces
+
+        tmp2 = VA*.5*(tmp*dwjl*g*exp(lamdaijl))*invrjlm;
+        fj[0] -= tmp2*rjl[0];
+        fj[1] -= tmp2*rjl[1];
+        fj[2] -= tmp2*rjl[2];
+        fl[0] += tmp2*rjl[0];
+        fl[1] += tmp2*rjl[1];
+        fl[2] += tmp2*rjl[2];
+
+        // PIJ forces
+
+        tmp2 = VA*.5*(tmp*dN2[ltype]*dwjl)*invrjlm;
+        fj[0] -= tmp2*rjl[0];
+        fj[1] -= tmp2*rjl[1];
+        fj[2] -= tmp2*rjl[2];
+        fl[0] += tmp2*rjl[0];
+        fl[1] += tmp2*rjl[1];
+        fl[2] += tmp2*rjl[2];
+
+        // dgdN forces
+
+        tmp2=VA*.5*(tmp*tmp3*dwjl)*invrjlm;
+        fj[0] -= tmp2*rjl[0];
+        fj[1] -= tmp2*rjl[1];
+        fj[2] -= tmp2*rjl[2];
+        fl[0] += tmp2*rjl[0];
+        fl[1] += tmp2*rjl[1];
+        fl[2] += tmp2*rjl[2];
+
+        f[atomi][0] += fi[0]; f[atomi][1] += fi[1]; f[atomi][2] += fi[2];
+        f[atomj][0] += fj[0]; f[atomj][1] += fj[1]; f[atomj][2] += fj[2];
+        f[atoml][0] += fl[0]; f[atoml][1] += fl[1]; f[atoml][2] += fl[2];
+
+        if (vflag_atom) {
+          rlj[0] = -rjl[0]; rlj[1] = -rjl[1]; rlj[2] = -rjl[2];
+          v_tally3_thr(atomi,atomj,atoml,fi,fl,rij,rlj,thr);
+        }
+      }
+    }
+
+    // piRC forces
+
+    dN3[0] = dN3piRC[0];
+    dN3[1] = dN3piRC[1];
+    dN3[2] = dN3piRC[2];
+
+    REBO_neighs_i = REBO_firstneigh[i];
+    for (k = 0; k < REBO_numneigh[i]; k++) {
+      atomk = REBO_neighs_i[k];
+      if (atomk != atomj) {
+        ktype = map[type[atomk]];
+        rik[0] = x[atomi][0]-x[atomk][0];
+        rik[1] = x[atomi][1]-x[atomk][1];
+        rik[2] = x[atomi][2]-x[atomk][2];
+        rikmag = sqrt((rik[0]*rik[0])+(rik[1]*rik[1])+(rik[2]*rik[2]));
+        wik = Sp(rikmag,rcmin[itype][ktype],rcmax[itype][ktype],dwik);
+        Nki = nC[atomk]-(wik*kronecker(itype,0))+nH[atomk] -
+          (wik*kronecker(itype,1));
+        SpN = Sp(Nki,Nmin,Nmax,dNki);
+
+        tmp2 = VA*dN3[0]*dwik/rikmag;
+        f[atomi][0] -= tmp2*rik[0];
+        f[atomi][1] -= tmp2*rik[1];
+        f[atomi][2] -= tmp2*rik[2];
+        f[atomk][0] += tmp2*rik[0];
+        f[atomk][1] += tmp2*rik[1];
+        f[atomk][2] += tmp2*rik[2];
+
+        if (vflag_atom) v_tally2_thr(atomi,atomk,-tmp2,rik,thr);
+
+        tmp2 = VA*dN3[2]*(2.0*NconjtmpI*dwik*SpN)/rikmag;
+        f[atomi][0] -= tmp2*rik[0];
+        f[atomi][1] -= tmp2*rik[1];
+        f[atomi][2] -= tmp2*rik[2];
+        f[atomk][0] += tmp2*rik[0];
+        f[atomk][1] += tmp2*rik[1];
+        f[atomk][2] += tmp2*rik[2];
+
+        if (vflag_atom) v_tally2_thr(atomi,atomk,-tmp2,rik,thr);
+
+        if (fabs(dNki) > TOL) {
+          REBO_neighs_k = REBO_firstneigh[atomk];
+          for (n = 0; n < REBO_numneigh[atomk]; n++) {
+            atomn = REBO_neighs_k[n];
+            if (atomn != atomi) {
+              ntype = map[type[atomn]];
+              rkn[0] = x[atomk][0]-x[atomn][0];
+              rkn[1] = x[atomk][1]-x[atomn][1];
+              rkn[2] = x[atomk][2]-x[atomn][2];
+              rknmag = sqrt((rkn[0]*rkn[0])+(rkn[1]*rkn[1])+(rkn[2]*rkn[2]));
+              Sp(rknmag,rcmin[ktype][ntype],rcmax[ktype][ntype],dwkn);
+
+              tmp2 = VA*dN3[2]*(2.0*NconjtmpI*wik*dNki*dwkn)/rknmag;
+              f[atomk][0] -= tmp2*rkn[0];
+              f[atomk][1] -= tmp2*rkn[1];
+              f[atomk][2] -= tmp2*rkn[2];
+              f[atomn][0] += tmp2*rkn[0];
+              f[atomn][1] += tmp2*rkn[1];
+              f[atomn][2] += tmp2*rkn[2];
+
+              if (vflag_atom) v_tally2_thr(atomk,atomn,-tmp2,rkn,thr);
+            }
+          }
+        }
+      }
+    }
+
+    // piRC forces to J side
+
+    REBO_neighs = REBO_firstneigh[j];
+    for (l = 0; l < REBO_numneigh[j]; l++) {
+      atoml = REBO_neighs[l];
+      if (atoml != atomi) {
+        ltype = map[type[atoml]];
+        rjl[0] = x[atomj][0]-x[atoml][0];
+        rjl[1] = x[atomj][1]-x[atoml][1];
+        rjl[2] = x[atomj][2]-x[atoml][2];
+        rjlmag = sqrt((rjl[0]*rjl[0])+(rjl[1]*rjl[1])+(rjl[2]*rjl[2]));
+        wjl = Sp(rjlmag,rcmin[jtype][ltype],rcmax[jtype][ltype],dwjl);
+        Nlj = nC[atoml]-(wjl*kronecker(jtype,0))+nH[atoml] -
+          (wjl*kronecker(jtype,1));
+        SpN = Sp(Nlj,Nmin,Nmax,dNlj);
+
+        tmp2 = VA*dN3[1]*dwjl/rjlmag;
+        f[atomj][0] -= tmp2*rjl[0];
+        f[atomj][1] -= tmp2*rjl[1];
+        f[atomj][2] -= tmp2*rjl[2];
+        f[atoml][0] += tmp2*rjl[0];
+        f[atoml][1] += tmp2*rjl[1];
+        f[atoml][2] += tmp2*rjl[2];
+
+        if (vflag_atom) v_tally2_thr(atomj,atoml,-tmp2,rjl,thr);
+
+        tmp2 = VA*dN3[2]*(2.0*NconjtmpJ*dwjl*SpN)/rjlmag;
+        f[atomj][0] -= tmp2*rjl[0];
+        f[atomj][1] -= tmp2*rjl[1];
+        f[atomj][2] -= tmp2*rjl[2];
+        f[atoml][0] += tmp2*rjl[0];
+        f[atoml][1] += tmp2*rjl[1];
+        f[atoml][2] += tmp2*rjl[2];
+
+        if (vflag_atom) v_tally2_thr(atomj,atoml,-tmp2,rjl,thr);
+
+        if (fabs(dNlj) > TOL) {
+          REBO_neighs_l = REBO_firstneigh[atoml];
+          for (n = 0; n < REBO_numneigh[atoml]; n++) {
+            atomn = REBO_neighs_l[n];
+            if (atomn != atomj) {
+              ntype = map[type[atomn]];
+              rln[0] = x[atoml][0]-x[atomn][0];
+              rln[1] = x[atoml][1]-x[atomn][1];
+              rln[2] = x[atoml][2]-x[atomn][2];
+              rlnmag = sqrt((rln[0]*rln[0])+(rln[1]*rln[1])+(rln[2]*rln[2]));
+              Sp(rlnmag,rcmin[ltype][ntype],rcmax[ltype][ntype],dwln);
+
+              tmp2 = VA*dN3[2]*(2.0*NconjtmpJ*wjl*dNlj*dwln)/rlnmag;
+              f[atoml][0] -= tmp2*rln[0];
+              f[atoml][1] -= tmp2*rln[1];
+              f[atoml][2] -= tmp2*rln[2];
+              f[atomn][0] += tmp2*rln[0];
+              f[atomn][1] += tmp2*rln[1];
+              f[atomn][2] += tmp2*rln[2];
+
+              if (vflag_atom) v_tally2_thr(atoml,atomn,-tmp2,rln,thr);
+            }
+          }
+        }
+      }
+    }
+
+    if (fabs(Tij) > TOL) {
+      dN3[0] = dN3Tij[0];
+      dN3[1] = dN3Tij[1];
+      dN3[2] = dN3Tij[2];
+      atom2 = atomi;
+      atom3 = atomj;
+      r32[0] = x[atom3][0]-x[atom2][0];
+      r32[1] = x[atom3][1]-x[atom2][1];
+      r32[2] = x[atom3][2]-x[atom2][2];
+      r32mag = sqrt((r32[0]*r32[0])+(r32[1]*r32[1])+(r32[2]*r32[2]));
+      r23[0] = -r32[0];
+      r23[1] = -r32[1];
+      r23[2] = -r32[2];
+      r23mag = r32mag;
+
+      REBO_neighs_i = REBO_firstneigh[i];
+      for (k = 0; k < REBO_numneigh[i]; k++) {
+        atomk = REBO_neighs_i[k];
+        atom1 = atomk;
+        ktype = map[type[atomk]];
+        if (atomk != atomj) {
+          r21[0] = x[atom2][0]-x[atom1][0];
+          r21[1] = x[atom2][1]-x[atom1][1];
+          r21[2] = x[atom2][2]-x[atom1][2];
+          r21mag = sqrt(r21[0]*r21[0] + r21[1]*r21[1] + r21[2]*r21[2]);
+          cos321 = ((r21[0]*rij[0])+(r21[1]*rij[1])+(r21[2]*rij[2])) /
+            (r21mag*rijmag);
+          cos321 = MIN(cos321,1.0);
+          cos321 = MAX(cos321,-1.0);
+          sin321 = sqrt(1.0 - cos321*cos321);
+          sink2i = 1.0/(sin321*sin321);
+          rik2i = 1.0/(r21mag*r21mag);
+
+          if (sin321 != 0.0) {
+            rr = (rijmag*rijmag)-(r21mag*r21mag);
+            rjk[0] = r21[0]-rij[0];
+            rjk[1] = r21[1]-rij[1];
+            rjk[2] = r21[2]-rij[2];
+            rjk2 = (rjk[0]*rjk[0])+(rjk[1]*rjk[1])+(rjk[2]*rjk[2]);
+            rijrik = 2.0*rijmag*r21mag;
+            rik2 = r21mag*r21mag;
+            dctik = (-rr+rjk2)/(rijrik*rik2);
+            dctij = (rr+rjk2)/(rijrik*rijmag*rijmag);
+            dctjk = -2.0/rijrik;
+            w21 = Sp(r21mag,rcmin[itype][ktype],rcmaxp[itype][ktype],dw21);
+            rikmag = r21mag;
+            rij2 = r32mag*r32mag;
+            rik2 = r21mag*r21mag;
+            costmp = 0.5*(rij2+rik2-rjk2)/rijmag/rikmag;
+            tspjik = Sp2(costmp,thmin,thmax,dtsjik);
+            dtsjik = -dtsjik;
+
+            REBO_neighs_j = REBO_firstneigh[j];
+            for (l = 0; l < REBO_numneigh[j]; l++) {
+              atoml = REBO_neighs_j[l];
+              atom4 = atoml;
+              ltype = map[type[atoml]];
+              if (!(atoml == atomi || atoml == atomk)) {
+                r34[0] = x[atom3][0]-x[atom4][0];
+                r34[1] = x[atom3][1]-x[atom4][1];
+                r34[2] = x[atom3][2]-x[atom4][2];
+                r34mag = sqrt(r34[0]*r34[0] + r34[1]*r34[1] + r34[2]*r34[2]);
+                cos234 = -1.0*((rij[0]*r34[0])+(rij[1]*r34[1]) +
+                               (rij[2]*r34[2]))/(rijmag*r34mag);
+                cos234 = MIN(cos234,1.0);
+                cos234 = MAX(cos234,-1.0);
+                sin234 = sqrt(1.0 - cos234*cos234);
+                sinl2i = 1.0/(sin234*sin234);
+                rjl2i = 1.0/(r34mag*r34mag);
+
+                if (sin234 != 0.0) {
+                  w34 = Sp(r34mag,rcmin[jtype][ltype],
+                           rcmaxp[jtype][ltype],dw34);
+                  rr = (r23mag*r23mag)-(r34mag*r34mag);
+                  ril[0] = r23[0]+r34[0];
+                  ril[1] = r23[1]+r34[1];
+                  ril[2] = r23[2]+r34[2];
+                  ril2 = (ril[0]*ril[0])+(ril[1]*ril[1])+(ril[2]*ril[2]);
+                  rijrjl = 2.0*r23mag*r34mag;
+                  rjl2 = r34mag*r34mag;
+                  dctjl = (-rr+ril2)/(rijrjl*rjl2);
+                  dctji = (rr+ril2)/(rijrjl*r23mag*r23mag);
+                  dctil = -2.0/rijrjl;
+                  rjlmag = r34mag;
+                  rjl2 = r34mag*r34mag;
+                  costmp = 0.5*(rij2+rjl2-ril2)/rijmag/rjlmag;
+                  tspijl = Sp2(costmp,thmin,thmax,dtsijl);
+                  dtsijl = -dtsijl; //need minus sign
+                  prefactor = VA*Tij;
+
+                  cross321[0] = (r32[1]*r21[2])-(r32[2]*r21[1]);
+                  cross321[1] = (r32[2]*r21[0])-(r32[0]*r21[2]);
+                  cross321[2] = (r32[0]*r21[1])-(r32[1]*r21[0]);
+                  cross234[0] = (r23[1]*r34[2])-(r23[2]*r34[1]);
+                  cross234[1] = (r23[2]*r34[0])-(r23[0]*r34[2]);
+                  cross234[2] = (r23[0]*r34[1])-(r23[1]*r34[0]);
+
+                  cwnum = (cross321[0]*cross234[0]) +
+                    (cross321[1]*cross234[1])+(cross321[2]*cross234[2]);
+                  cwnom = r21mag*r34mag*r23mag*r23mag*sin321*sin234;
+                  om1234 = cwnum/cwnom;
+                  cw = om1234;
+                  Etmp += ((1.0-(om1234*om1234))*w21*w34) *
+                    (1.0-tspjik)*(1.0-tspijl);
+
+                  dt1dik = (rik2i)-(dctik*sink2i*cos321);
+                  dt1djk = (-dctjk*sink2i*cos321);
+                  dt1djl = (rjl2i)-(dctjl*sinl2i*cos234);
+                  dt1dil = (-dctil*sinl2i*cos234);
+                  dt1dij = (2.0/(r23mag*r23mag)) -
+                    (dctij*sink2i*cos321)-(dctji*sinl2i*cos234);
+
+                  dt2dik[0] = (-r23[2]*cross234[1])+(r23[1]*cross234[2]);
+                  dt2dik[1] = (-r23[0]*cross234[2])+(r23[2]*cross234[0]);
+                  dt2dik[2] = (-r23[1]*cross234[0])+(r23[0]*cross234[1]);
+
+                  dt2djl[0] = (-r23[1]*cross321[2])+(r23[2]*cross321[1]);
+                  dt2djl[1] = (-r23[2]*cross321[0])+(r23[0]*cross321[2]);
+                  dt2djl[2] = (-r23[0]*cross321[1])+(r23[1]*cross321[0]);
+
+                  dt2dij[0] = (r21[2]*cross234[1]) -
+                    (r34[2]*cross321[1])-(r21[1]*cross234[2]) +
+                    (r34[1]*cross321[2]);
+                  dt2dij[1] = (r21[0]*cross234[2]) -
+                    (r34[0]*cross321[2])-(r21[2]*cross234[0]) +
+                    (r34[2]*cross321[0]);
+                  dt2dij[2] = (r21[1]*cross234[0]) -
+                    (r34[1]*cross321[0])-(r21[0]*cross234[1]) +
+                    (r34[0]*cross321[1]);
+
+                  aa = (prefactor*2.0*cw/cwnom)*w21*w34 *
+                    (1.0-tspjik)*(1.0-tspijl);
+                  aaa1 = -prefactor*(1.0-(om1234*om1234)) *
+                    (1.0-tspjik)*(1.0-tspijl);
+                  aaa2 = aaa1*w21*w34;
+                  at2 = aa*cwnum;
+
+                  fcijpc = (-dt1dij*at2)+(aaa2*dtsjik*dctij*(1.0-tspijl)) +
+                    (aaa2*dtsijl*dctji*(1.0-tspjik));
+                  fcikpc = (-dt1dik*at2)+(aaa2*dtsjik*dctik*(1.0-tspijl));
+                  fcjlpc = (-dt1djl*at2)+(aaa2*dtsijl*dctjl*(1.0-tspjik));
+                  fcjkpc = (-dt1djk*at2)+(aaa2*dtsjik*dctjk*(1.0-tspijl));
+                  fcilpc = (-dt1dil*at2)+(aaa2*dtsijl*dctil*(1.0-tspjik));
+
+                  F23[0] = (fcijpc*r23[0])+(aa*dt2dij[0]);
+                  F23[1] = (fcijpc*r23[1])+(aa*dt2dij[1]);
+                  F23[2] = (fcijpc*r23[2])+(aa*dt2dij[2]);
+
+                  F12[0] = (fcikpc*r21[0])+(aa*dt2dik[0]);
+                  F12[1] = (fcikpc*r21[1])+(aa*dt2dik[1]);
+                  F12[2] = (fcikpc*r21[2])+(aa*dt2dik[2]);
+
+                  F34[0] = (fcjlpc*r34[0])+(aa*dt2djl[0]);
+                  F34[1] = (fcjlpc*r34[1])+(aa*dt2djl[1]);
+                  F34[2] = (fcjlpc*r34[2])+(aa*dt2djl[2]);
+
+                  F31[0] = (fcjkpc*rjk[0]);
+                  F31[1] = (fcjkpc*rjk[1]);
+                  F31[2] = (fcjkpc*rjk[2]);
+
+                  F24[0] = (fcilpc*ril[0]);
+                  F24[1] = (fcilpc*ril[1]);
+                  F24[2] = (fcilpc*ril[2]);
+
+                  f1[0] = -F12[0]-F31[0];
+                  f1[1] = -F12[1]-F31[1];
+                  f1[2] = -F12[2]-F31[2];
+                  f2[0] = F23[0]+F12[0]+F24[0];
+                  f2[1] = F23[1]+F12[1]+F24[1];
+                  f2[2] = F23[2]+F12[2]+F24[2];
+                  f3[0] = -F23[0]+F34[0]+F31[0];
+                  f3[1] = -F23[1]+F34[1]+F31[1];
+                  f3[2] = -F23[2]+F34[2]+F31[2];
+                  f4[0] = -F34[0]-F24[0];
+                  f4[1] = -F34[1]-F24[1];
+                  f4[2] = -F34[2]-F24[2];
+
+                  // coordination forces
+
+                  tmp2 = VA*Tij*((1.0-(om1234*om1234))) *
+                    (1.0-tspjik)*(1.0-tspijl)*dw21*w34/r21mag;
+                  f2[0] -= tmp2*r21[0];
+                  f2[1] -= tmp2*r21[1];
+                  f2[2] -= tmp2*r21[2];
+                  f1[0] += tmp2*r21[0];
+                  f1[1] += tmp2*r21[1];
+                  f1[2] += tmp2*r21[2];
+
+                  tmp2 = VA*Tij*((1.0-(om1234*om1234))) *
+                    (1.0-tspjik)*(1.0-tspijl)*w21*dw34/r34mag;
+                  f3[0] -= tmp2*r34[0];
+                  f3[1] -= tmp2*r34[1];
+                  f3[2] -= tmp2*r34[2];
+                  f4[0] += tmp2*r34[0];
+                  f4[1] += tmp2*r34[1];
+                  f4[2] += tmp2*r34[2];
+
+                  f[atom1][0] += f1[0]; f[atom1][1] += f1[1];
+                  f[atom1][2] += f1[2];
+                  f[atom2][0] += f2[0]; f[atom2][1] += f2[1];
+                  f[atom2][2] += f2[2];
+                  f[atom3][0] += f3[0]; f[atom3][1] += f3[1];
+                  f[atom3][2] += f3[2];
+                  f[atom4][0] += f4[0]; f[atom4][1] += f4[1];
+                  f[atom4][2] += f4[2];
+
+                  if (vflag_atom) {
+                    r13[0] = -rjk[0]; r13[1] = -rjk[1]; r13[2] = -rjk[2];
+                    r43[0] = -r34[0]; r43[1] = -r34[1]; r43[2] = -r34[2];
+                    v_tally4_thr(atom1,atom2,atom3,atom4,f1,f2,f4,r13,r23,r43,thr);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+
+      REBO_neighs = REBO_firstneigh[i];
+      for (k = 0; k < REBO_numneigh[i]; k++) {
+        atomk = REBO_neighs[k];
+        if (atomk != atomj) {
+          ktype = map[type[atomk]];
+          rik[0] = x[atomi][0]-x[atomk][0];
+          rik[1] = x[atomi][1]-x[atomk][1];
+          rik[2] = x[atomi][2]-x[atomk][2];
+          rikmag = sqrt((rik[0]*rik[0])+(rik[1]*rik[1])+(rik[2]*rik[2]));
+          wik = Sp(rikmag,rcmin[itype][ktype],rcmax[itype][ktype],dwik);
+          Nki = nC[atomk]-(wik*kronecker(itype,0))+nH[atomk] -
+            (wik*kronecker(itype,1));
+          SpN = Sp(Nki,Nmin,Nmax,dNki);
+
+          tmp2 = VA*dN3[0]*dwik*Etmp/rikmag;
+          f[atomi][0] -= tmp2*rik[0];
+          f[atomi][1] -= tmp2*rik[1];
+          f[atomi][2] -= tmp2*rik[2];
+          f[atomk][0] += tmp2*rik[0];
+          f[atomk][1] += tmp2*rik[1];
+          f[atomk][2] += tmp2*rik[2];
+
+          if (vflag_atom) v_tally2_thr(atomi,atomk,-tmp2,rik,thr);
+
+          tmp2 = VA*dN3[2]*(2.0*NconjtmpI*dwik*SpN)*Etmp/rikmag;
+          f[atomi][0] -= tmp2*rik[0];
+          f[atomi][1] -= tmp2*rik[1];
+          f[atomi][2] -= tmp2*rik[2];
+          f[atomk][0] += tmp2*rik[0];
+          f[atomk][1] += tmp2*rik[1];
+          f[atomk][2] += tmp2*rik[2];
+
+          if (vflag_atom) v_tally2_thr(atomi,atomk,-tmp2,rik,thr);
+
+          if (fabs(dNki)  >TOL) {
+            REBO_neighs_k = REBO_firstneigh[atomk];
+            for (n = 0; n < REBO_numneigh[atomk]; n++) {
+              atomn = REBO_neighs_k[n];
+              ntype = map[type[atomn]];
+              if (atomn !=atomi) {
+                rkn[0] = x[atomk][0]-x[atomn][0];
+                rkn[1] = x[atomk][1]-x[atomn][1];
+                rkn[2] = x[atomk][2]-x[atomn][2];
+                rknmag = sqrt((rkn[0]*rkn[0])+(rkn[1]*rkn[1])+(rkn[2]*rkn[2]));
+                Sp(rknmag,rcmin[ktype][ntype],rcmax[ktype][ntype],dwkn);
+
+                tmp2 = VA*dN3[2]*(2.0*NconjtmpI*wik*dNki*dwkn)*Etmp/rknmag;
+                f[atomk][0] -= tmp2*rkn[0];
+                f[atomk][1] -= tmp2*rkn[1];
+                f[atomk][2] -= tmp2*rkn[2];
+                f[atomn][0] += tmp2*rkn[0];
+                f[atomn][1] += tmp2*rkn[1];
+                f[atomn][2] += tmp2*rkn[2];
+
+                if (vflag_atom) v_tally2_thr(atomk,atomn,-tmp2,rkn,thr);
+              }
+            }
+          }
+        }
+      }
+
+      // Tij forces
+
+      REBO_neighs = REBO_firstneigh[j];
+      for (l = 0; l < REBO_numneigh[j]; l++) {
+        atoml = REBO_neighs[l];
+        if (atoml != atomi) {
+          ltype = map[type[atoml]];
+          rjl[0] = x[atomj][0]-x[atoml][0];
+          rjl[1] = x[atomj][1]-x[atoml][1];
+          rjl[2] = x[atomj][2]-x[atoml][2];
+          rjlmag = sqrt((rjl[0]*rjl[0])+(rjl[1]*rjl[1])+(rjl[2]*rjl[2]));
+          wjl = Sp(rjlmag,rcmin[jtype][ltype],rcmax[jtype][ltype],dwjl);
+          Nlj = nC[atoml]-(wjl*kronecker(jtype,0))+nH[atoml] -
+            (wjl*kronecker(jtype,1));
+          SpN = Sp(Nlj,Nmin,Nmax,dNlj);
+
+          tmp2 = VA*dN3[1]*dwjl*Etmp/rjlmag;
+          f[atomj][0] -= tmp2*rjl[0];
+          f[atomj][1] -= tmp2*rjl[1];
+          f[atomj][2] -= tmp2*rjl[2];
+          f[atoml][0] += tmp2*rjl[0];
+          f[atoml][1] += tmp2*rjl[1];
+          f[atoml][2] += tmp2*rjl[2];
+
+          if (vflag_atom) v_tally2_thr(atomj,atoml,-tmp2,rjl,thr);
+
+          tmp2 = VA*dN3[2]*(2.0*NconjtmpJ*dwjl*SpN)*Etmp/rjlmag;
+          f[atomj][0] -= tmp2*rjl[0];
+          f[atomj][1] -= tmp2*rjl[1];
+          f[atomj][2] -= tmp2*rjl[2];
+          f[atoml][0] += tmp2*rjl[0];
+          f[atoml][1] += tmp2*rjl[1];
+          f[atoml][2] += tmp2*rjl[2];
+
+          if (vflag_atom) v_tally2_thr(atomj,atoml,-tmp2,rjl,thr);
+
+          if (fabs(dNlj) > TOL) {
+            REBO_neighs_l = REBO_firstneigh[atoml];
+            for (n = 0; n < REBO_numneigh[atoml]; n++) {
+              atomn = REBO_neighs_l[n];
+              ntype = map[type[atomn]];
+              if (atomn != atomj) {
+                rln[0] = x[atoml][0]-x[atomn][0];
+                rln[1] = x[atoml][1]-x[atomn][1];
+                rln[2] = x[atoml][2]-x[atomn][2];
+                rlnmag = sqrt((rln[0]*rln[0])+(rln[1]*rln[1])+(rln[2]*rln[2]));
+                Sp(rlnmag,rcmin[ltype][ntype],rcmax[ltype][ntype],dwln);
+
+                tmp2 = VA*dN3[2]*(2.0*NconjtmpJ*wjl*dNlj*dwln)*Etmp/rlnmag;
+                f[atoml][0] -= tmp2*rln[0];
+                f[atoml][1] -= tmp2*rln[1];
+                f[atoml][2] -= tmp2*rln[2];
+                f[atomn][0] += tmp2*rln[0];
+                f[atomn][1] += tmp2*rln[1];
+                f[atomn][2] += tmp2*rln[2];
+
+                if (vflag_atom) v_tally2_thr(atoml,atomn,-tmp2,rln,thr);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return Stb;
+}
+
+/* ----------------------------------------------------------------------
+   REBO forces and energy
+------------------------------------------------------------------------- */
+
+void PairAIREBOOMP::FREBO_thr(int ifrom, int ito, int evflag, int eflag,
+                              int vflag_atom, ThrData * const thr)
+{
+  int i,j,k,m,ii,itype,jtype,itag,jtag;
+  double delx,dely,delz,evdwl,fpair,xtmp,ytmp,ztmp;
+  double rsq,rij,wij;
+  double Qij,Aij,alphaij,VR,pre,dVRdi,VA,term,bij,dVAdi,dVA;
+  double dwij,del[3];
+  int *ilist,*REBO_neighs;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  int *type = atom->type;
+  int *tag = atom->tag;
+  int nlocal = atom->nlocal;
+
+  ilist = list->ilist;
+
+  // two-body interactions from REBO neighbor list, skip half of them
+
+  for (ii = ifrom; ii < ito; ii++) {
+    i = ilist[ii];
+    itag = tag[i];
+    itype = map[type[i]];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    REBO_neighs = REBO_firstneigh[i];
+
+    for (k = 0; k < REBO_numneigh[i]; k++) {
+      j = REBO_neighs[k];
+      jtag = tag[j];
+
+      if (itag > jtag) {
+        if ((itag+jtag) % 2 == 0) continue;
+      } else if (itag < jtag) {
+        if ((itag+jtag) % 2 == 1) continue;
+      } else {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp && x[j][1] < ytmp) continue;
+        if (x[j][2] == ztmp && x[j][1] == ytmp && x[j][0] < xtmp) continue;
+      }
+
+      jtype = map[type[j]];
+
+      delx = x[i][0] - x[j][0];
+      dely = x[i][1] - x[j][1];
+      delz = x[i][2] - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      rij = sqrt(rsq);
+      wij = Sp(rij,rcmin[itype][jtype],rcmax[itype][jtype],dwij);
+      if (wij <= TOL) continue;
+
+      Qij = Q[itype][jtype];
+      Aij = A[itype][jtype];
+      alphaij = alpha[itype][jtype];
+
+      VR = wij*(1.0+(Qij/rij)) * Aij*exp(-alphaij*rij);
+      pre = wij*Aij * exp(-alphaij*rij);
+      dVRdi = pre * ((-alphaij)-(Qij/rsq)-(Qij*alphaij/rij));
+      dVRdi += VR/wij * dwij;
+
+      VA = dVA = 0.0;
+      for (m = 0; m < 3; m++) {
+        term = -wij * BIJc[itype][jtype][m] * exp(-Beta[itype][jtype][m]*rij);
+        VA += term;
+        dVA += -Beta[itype][jtype][m] * term;
+      }
+      dVA += VA/wij * dwij;
+      del[0] = delx;
+      del[1] = dely;
+      del[2] = delz;
+      bij = bondorder_thr(i,j,del,rij,VA,vflag_atom,thr);
+      dVAdi = bij*dVA;
+
+      fpair = -(dVRdi+dVAdi) / rij;
+      f[i][0] += delx*fpair;
+      f[i][1] += dely*fpair;
+      f[i][2] += delz*fpair;
+      f[j][0] -= delx*fpair;
+      f[j][1] -= dely*fpair;
+      f[j][2] -= delz*fpair;
+
+      if (eflag) evdwl = VR + bij*VA;
+      if (evflag) ev_tally_thr(this,i,j,nlocal,/* newton_pair */ 1,
+                               evdwl,0.0,fpair,delx,dely,delz,thr);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute LJ forces and energy
+   find 3- and 4-step paths between atoms I,J via REBO neighbor lists
+------------------------------------------------------------------------- */
+
+void PairAIREBOOMP::FLJ_thr(int ifrom, int ito, int evflag, int eflag,
+                            int vflag_atom, ThrData * const thr)
+{
+  int i,j,k,m,ii,jj,kk,mm,jnum,itype,jtype,ktype,mtype,itag,jtag;
+  int atomi,atomj,atomk,atomm;
+  int testpath,npath,done;
+  double evdwl,fpair,xtmp,ytmp,ztmp;
+  double rsq,best,wik,wkm,cij,rij,dwij,dwik,dwkj,dwkm,dwmj;
+  double delij[3],rijsq,delik[3],rik,deljk[3];
+  double rkj,wkj,dC,VLJ,dVLJ,VA,Str,dStr,Stb;
+  double vdw,slw,dvdw,dslw,drij,swidth,tee,tee2;
+  double rljmin,rljmax,sigcut,sigmin,sigwid;
+  double delkm[3],rkm,deljm[3],rmj,wmj,r2inv,r6inv,scale,delscale[3];
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  int *REBO_neighs_i,*REBO_neighs_k;
+  double delikS[3],deljkS[3],delkmS[3],deljmS[3],delimS[3];
+  double rikS,rkjS,rkmS,rmjS,wikS,dwikS;
+  double wkjS,dwkjS,wkmS,dwkmS,wmjS,dwmjS;
+  double fpair1,fpair2,fpair3;
+  double fi[3],fj[3],fk[3],fm[3];
+
+  // I-J interaction from full neighbor list
+  // skip 1/2 of interactions since only consider each pair once
+
+  evdwl = 0.0;
+  rljmin = 0.0;
+  rljmax = 0.0;
+  sigcut = 0.0;
+  sigmin = 0.0;
+  sigwid = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  int *tag = atom->tag;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = ifrom; ii < ito; ii++) {
+    i = ilist[ii];
+    itag = tag[i];
+    itype = map[type[i]];
+    atomi = i;
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtag = tag[j];
+
+      if (itag > jtag) {
+        if ((itag+jtag) % 2 == 0) continue;
+      } else if (itag < jtag) {
+        if ((itag+jtag) % 2 == 1) continue;
+      } else {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp && x[j][1] < ytmp) continue;
+        if (x[j][2] == ztmp && x[j][1] == ytmp && x[j][0] < xtmp) continue;
+      }
+
+      jtype = map[type[j]];
+      atomj = j;
+
+      delij[0] = xtmp - x[j][0];
+      delij[1] = ytmp - x[j][1];
+      delij[2] = ztmp - x[j][2];
+      rijsq = delij[0]*delij[0] + delij[1]*delij[1] + delij[2]*delij[2];
+
+      // if outside of LJ cutoff, skip
+      // if outside of 4-path cutoff, best = 0.0, no need to test paths
+      // if outside of 2-path cutoff but inside 4-path cutoff,
+      //   best = 0.0, test 3-,4-paths
+      // if inside 2-path cutoff, best = wij, only test 3-,4-paths if best < 1
+
+      if (rijsq >= cutljsq[itype][jtype]) continue;
+      rij = sqrt(rijsq);
+      if (rij >= cut3rebo) {
+        best = 0.0;
+        testpath = 0;
+      } else if (rij >= rcmax[itype][jtype]) {
+        best = 0.0;
+        testpath = 1;
+      } else {
+        best = Sp(rij,rcmin[itype][jtype],rcmax[itype][jtype],dwij);
+        npath = 2;
+        if (best < 1.0) testpath = 1;
+        else testpath = 0;
+      }
+
+      done = 0;
+      if (testpath) {
+
+        // test all 3-body paths = I-K-J
+        // I-K interactions come from atom I's REBO neighbors
+        // if wik > current best, compute wkj
+        // if best = 1.0, done
+
+        REBO_neighs_i = REBO_firstneigh[i];
+        for (kk = 0; kk < REBO_numneigh[i] && done==0; kk++) {
+          k = REBO_neighs_i[kk];
+          if (k == j) continue;
+          ktype = map[type[k]];
+
+          delik[0] = x[i][0] - x[k][0];
+          delik[1] = x[i][1] - x[k][1];
+          delik[2] = x[i][2] - x[k][2];
+          rsq = delik[0]*delik[0] + delik[1]*delik[1] + delik[2]*delik[2];
+          if (rsq < rcmaxsq[itype][ktype]) {
+            rik = sqrt(rsq);
+            wik = Sp(rik,rcmin[itype][ktype],rcmax[itype][ktype],dwik);
+          } else wik = 0.0;
+
+          if (wik > best) {
+            deljk[0] = x[j][0] - x[k][0];
+            deljk[1] = x[j][1] - x[k][1];
+            deljk[2] = x[j][2] - x[k][2];
+            rsq = deljk[0]*deljk[0] + deljk[1]*deljk[1] + deljk[2]*deljk[2];
+            if (rsq < rcmaxsq[ktype][jtype]) {
+              rkj = sqrt(rsq);
+              wkj = Sp(rkj,rcmin[ktype][jtype],rcmax[ktype][jtype],dwkj);
+              if (wik*wkj > best) {
+                best = wik*wkj;
+                npath = 3;
+                 atomk = k;
+                    delikS[0] = delik[0];
+                    delikS[1] = delik[1];
+                    delikS[2] = delik[2];
+                    rikS = rik;
+                    wikS = wik;
+                    dwikS = dwik;
+                    deljkS[0] = deljk[0];
+                    deljkS[1] = deljk[1];
+                    deljkS[2] = deljk[2];
+                    rkjS = rkj;
+                    wkjS = wkj;
+                    dwkjS = dwkj;
+                if (best == 1.0) {
+                  done = 1;
+                  break;
+                }
+              }
+            }
+
+            // test all 4-body paths = I-K-M-J
+            // K-M interactions come from atom K's REBO neighbors
+            // if wik*wkm > current best, compute wmj
+            // if best = 1.0, done
+
+            REBO_neighs_k = REBO_firstneigh[k];
+            for (mm = 0; mm < REBO_numneigh[k] && done==0; mm++) {
+              m = REBO_neighs_k[mm];
+              if (m == i || m == j) continue;
+              mtype = map[type[m]];
+              delkm[0] = x[k][0] - x[m][0];
+              delkm[1] = x[k][1] - x[m][1];
+              delkm[2] = x[k][2] - x[m][2];
+              rsq = delkm[0]*delkm[0] + delkm[1]*delkm[1] + delkm[2]*delkm[2];
+              if (rsq < rcmaxsq[ktype][mtype]) {
+                rkm = sqrt(rsq);
+                wkm = Sp(rkm,rcmin[ktype][mtype],rcmax[ktype][mtype],dwkm);
+              } else wkm = 0.0;
+
+              if (wik*wkm > best) {
+                deljm[0] = x[j][0] - x[m][0];
+                deljm[1] = x[j][1] - x[m][1];
+                deljm[2] = x[j][2] - x[m][2];
+                rsq = deljm[0]*deljm[0] + deljm[1]*deljm[1] +
+                  deljm[2]*deljm[2];
+                if (rsq < rcmaxsq[mtype][jtype]) {
+                  rmj = sqrt(rsq);
+                  wmj = Sp(rmj,rcmin[mtype][jtype],rcmax[mtype][jtype],dwmj);
+                  if (wik*wkm*wmj > best) {
+                    best = wik*wkm*wmj;
+                    npath = 4;
+                    atomk = k;
+                    delikS[0] = delik[0];
+                    delikS[1] = delik[1];
+                    delikS[2] = delik[2];
+                    rikS = rik;
+                    wikS = wik;
+                    dwikS = dwik;
+                    atomm = m;
+                    delkmS[0] = delkm[0];
+                    delkmS[1] = delkm[1];
+                        delkmS[2] = delkm[2];
+                    rkmS = rkm;
+                    wkmS = wkm;
+                    dwkmS = dwkm;
+                    deljmS[0] = deljm[0];
+                    deljmS[1] = deljm[1];
+                       deljmS[2] = deljm[2];
+                    rmjS = rmj;
+                    wmjS = wmj;
+                    dwmjS = dwmj;
+                    if (best == 1.0) {
+                      done = 1;
+                      break;
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+
+      cij = 1.0 - best;
+      if (cij == 0.0) continue;
+
+      // compute LJ forces and energy
+
+      sigwid = 0.84;
+      sigcut = 3.0;
+      sigmin = sigcut - sigwid;
+
+      rljmin = sigma[itype][jtype];
+      rljmax = sigcut * rljmin;
+      rljmin = sigmin * rljmin;
+
+      if (rij > rljmax) {
+        slw = 0.0;
+        dslw = 0.0;
+      } else if (rij > rljmin) {
+        drij = rij - rljmin;
+        swidth = rljmax - rljmin;
+        tee = drij / swidth;
+        tee2 = tee*tee;
+        slw = 1.0 - tee2 * (3.0 - 2.0 * tee);
+        dslw = 6.0 * tee * (1.0 - tee) / rij / swidth;
+      } else {
+        slw = 1.0;
+        dslw = 0.0;
+      }
+
+      r2inv = 1.0/rijsq;
+      r6inv = r2inv*r2inv*r2inv;
+
+      vdw = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]);
+      dvdw = -r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]) / rij;
+
+      // VLJ now becomes vdw * slw, derivaties, etc.
+
+      VLJ = vdw * slw;
+      dVLJ = dvdw * slw + vdw * dslw;
+
+      Str = Sp2(rij,rcLJmin[itype][jtype],rcLJmax[itype][jtype],dStr);
+      VA = Str*cij*VLJ;
+      if (Str > 0.0) {
+        scale = rcmin[itype][jtype] / rij;
+        delscale[0] = scale * delij[0];
+        delscale[1] = scale * delij[1];
+        delscale[2] = scale * delij[2];
+        Stb = bondorderLJ_thr(i,j,delscale,rcmin[itype][jtype],VA,
+                              delij,rij,vflag_atom,thr);
+      } else Stb = 0.0;
+
+      fpair = -(dStr * (Stb*cij*VLJ - cij*VLJ) +
+                dVLJ * (Str*Stb*cij + cij - Str*cij)) / rij;
+
+      f[i][0] += delij[0]*fpair;
+      f[i][1] += delij[1]*fpair;
+      f[i][2] += delij[2]*fpair;
+      f[j][0] -= delij[0]*fpair;
+      f[j][1] -= delij[1]*fpair;
+      f[j][2] -= delij[2]*fpair;
+
+      if (eflag) evdwl = VA*Stb + (1.0-Str)*cij*VLJ;
+      if (evflag) ev_tally_thr(this,i,j,nlocal,/* newton_pair */ 1,
+                               evdwl,0.0,fpair,delij[0],delij[1],delij[2],thr);
+
+      if (cij < 1.0) {
+        dC = Str*Stb*VLJ + (1.0-Str)*VLJ;
+        if (npath == 2) {
+          fpair = dC*dwij / rij;
+          f[atomi][0] += delij[0]*fpair;
+          f[atomi][1] += delij[1]*fpair;
+          f[atomi][2] += delij[2]*fpair;
+          f[atomj][0] -= delij[0]*fpair;
+          f[atomj][1] -= delij[1]*fpair;
+          f[atomj][2] -= delij[2]*fpair;
+
+          if (vflag_atom) v_tally2_thr(atomi,atomj,fpair,delij,thr);
+
+        } else if (npath == 3) {
+          fpair1 = dC*dwikS*wkjS / rikS;
+          fi[0] = delikS[0]*fpair1;
+          fi[1] = delikS[1]*fpair1;
+          fi[2] = delikS[2]*fpair1;
+          fpair2 = dC*wikS*dwkjS / rkjS;
+          fj[0] = deljkS[0]*fpair2;
+          fj[1] = deljkS[1]*fpair2;
+          fj[2] = deljkS[2]*fpair2;
+
+          f[atomi][0] += fi[0];
+          f[atomi][1] += fi[1];
+          f[atomi][2] += fi[2];
+          f[atomj][0] += fj[0];
+          f[atomj][1] += fj[1];
+          f[atomj][2] += fj[2];
+          f[atomk][0] -= fi[0] + fj[0];
+          f[atomk][1] -= fi[1] + fj[1];
+          f[atomk][2] -= fi[2] + fj[2];
+
+          if (vflag_atom)
+            v_tally3_thr(atomi,atomj,atomk,fi,fj,delikS,deljkS,thr);
+
+        } else {
+          fpair1 = dC*dwikS*wkmS*wmjS / rikS;
+          fi[0] = delikS[0]*fpair1;
+          fi[1] = delikS[1]*fpair1;
+          fi[2] = delikS[2]*fpair1;
+
+          fpair2 = dC*wikS*dwkmS*wmjS / rkmS;
+          fk[0] = delkmS[0]*fpair2 - fi[0];
+          fk[1] = delkmS[1]*fpair2 - fi[1];
+          fk[2] = delkmS[2]*fpair2 - fi[2];
+
+          fpair3 = dC*wikS*wkmS*dwmjS / rmjS;
+          fj[0] = deljmS[0]*fpair3;
+          fj[1] = deljmS[1]*fpair3;
+          fj[2] = deljmS[2]*fpair3;
+
+          fm[0] = -delkmS[0]*fpair2 - fj[0];
+          fm[1] = -delkmS[1]*fpair2 - fj[1];
+          fm[2] = -delkmS[2]*fpair2 - fj[2];
+
+          f[atomi][0] += fi[0];
+          f[atomi][1] += fi[1];
+          f[atomi][2] += fi[2];
+          f[atomj][0] += fj[0];
+          f[atomj][1] += fj[1];
+          f[atomj][2] += fj[2];
+          f[atomk][0] += fk[0];
+          f[atomk][1] += fk[1];
+          f[atomk][2] += fk[2];
+          f[atomm][0] += fm[0];
+          f[atomm][1] += fm[1];
+          f[atomm][2] += fm[2];
+
+          if (vflag_atom) {
+            delimS[0] = delikS[0] + delkmS[0];
+            delimS[1] = delikS[1] + delkmS[1];
+            delimS[2] = delikS[2] + delkmS[2];
+            v_tally4_thr(atomi,atomj,atomk,atomm,fi,fj,fk,delimS,deljmS,delkmS,thr);
+          }
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   torsional forces and energy
+------------------------------------------------------------------------- */
+
+void PairAIREBOOMP::TORSION_thr(int ifrom, int ito,
+                                int evflag, int eflag, ThrData * const thr)
+{
+  int i,j,k,l,ii,itag,jtag;
+  double evdwl,fpair,xtmp,ytmp,ztmp;
+  double cos321;
+  double w21,dw21,cos234,w34,dw34;
+  double cross321[3],cross321mag,cross234[3],cross234mag;
+  double w23,dw23,cw2,ekijl,Ec;
+  double cw,cwnum,cwnom;
+  double rij,rij2,rik,rjl,tspjik,dtsjik,tspijl,dtsijl,costmp,fcpc;
+  double sin321,sin234,rjk2,rik2,ril2,rjl2;
+  double rjk,ril;
+  double Vtors;
+  double dndij[3],tmpvec[3],dndik[3],dndjl[3];
+  double dcidij,dcidik,dcidjk,dcjdji,dcjdjl,dcjdil;
+  double dsidij,dsidik,dsidjk,dsjdji,dsjdjl,dsjdil;
+  double dxidij,dxidik,dxidjk,dxjdji,dxjdjl,dxjdil;
+  double ddndij,ddndik,ddndjk,ddndjl,ddndil,dcwddn,dcwdn,dvpdcw,Ftmp[3];
+  double del32[3],rsq,r32,del23[3],del21[3],r21;
+  double deljk[3],del34[3],delil[3],delkl[3],r23,r34;
+  double fi[3],fj[3],fk[3],fl[3];
+  int itype,jtype,ktype,ltype,kk,ll,jj;
+  int *ilist,*REBO_neighs_i,*REBO_neighs_j;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  int *type = atom->type;
+  int *tag = atom->tag;
+
+  ilist = list->ilist;
+
+  for (ii = ifrom; ii < ito; ii++) {
+    i = ilist[ii];
+    itag = tag[i];
+    itype = map[type[i]];
+    if (itype != 0) continue;
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    REBO_neighs_i = REBO_firstneigh[i];
+
+    for (jj = 0; jj < REBO_numneigh[i]; jj++) {
+      j = REBO_neighs_i[jj];
+      jtag = tag[j];
+
+      if (itag > jtag) {
+        if ((itag+jtag) % 2 == 0) continue;
+      } else if (itag < jtag) {
+        if ((itag+jtag) % 2 == 1) continue;
+      } else {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp && x[j][1] < ytmp) continue;
+        if (x[j][2] == ztmp && x[j][1] == ytmp && x[j][0] < xtmp) continue;
+      }
+
+      jtype = map[type[j]];
+      if (jtype != 0) continue;
+
+      del32[0] = x[j][0]-x[i][0];
+      del32[1] = x[j][1]-x[i][1];
+      del32[2] = x[j][2]-x[i][2];
+      rsq = del32[0]*del32[0] + del32[1]*del32[1] + del32[2]*del32[2];
+      r32 = sqrt(rsq);
+      del23[0] = -del32[0];
+      del23[1] = -del32[1];
+      del23[2] = -del32[2];
+      r23 = r32;
+      w23 = Sp(r23,rcmin[itype][jtype],rcmax[itype][jtype],dw23);
+
+      for (kk = 0; kk < REBO_numneigh[i]; kk++) {
+        k = REBO_neighs_i[kk];
+        ktype = map[type[k]];
+        if (k == j) continue;
+        del21[0] = x[i][0]-x[k][0];
+        del21[1] = x[i][1]-x[k][1];
+        del21[2] = x[i][2]-x[k][2];
+        rsq = del21[0]*del21[0] + del21[1]*del21[1] + del21[2]*del21[2];
+        r21 = sqrt(rsq);
+        cos321 = - ((del21[0]*del32[0]) + (del21[1]*del32[1]) +
+                    (del21[2]*del32[2])) / (r21*r32);
+        cos321 = MIN(cos321,1.0);
+        cos321 = MAX(cos321,-1.0);
+        sin321 = sqrt(1.0 - cos321*cos321);
+        if (sin321 < TOL) continue;
+
+        deljk[0] = del21[0]-del23[0];
+        deljk[1] = del21[1]-del23[1];
+        deljk[2] = del21[2]-del23[2];
+        rjk2 = deljk[0]*deljk[0] + deljk[1]*deljk[1] + deljk[2]*deljk[2];
+        rjk=sqrt(rjk2);
+        rik2 = r21*r21;
+        w21 = Sp(r21,rcmin[itype][ktype],rcmax[itype][ktype],dw21);
+
+        rij = r32;
+        rik = r21;
+        rij2 = r32*r32;
+        rik2 = r21*r21;
+        costmp = 0.5*(rij2+rik2-rjk2)/rij/rik;
+        tspjik = Sp2(costmp,thmin,thmax,dtsjik);
+        dtsjik = -dtsjik;
+
+        REBO_neighs_j = REBO_firstneigh[j];
+        for (ll = 0; ll < REBO_numneigh[j]; ll++) {
+          l = REBO_neighs_j[ll];
+          ltype = map[type[l]];
+          if (l == i || l == k) continue;
+          del34[0] = x[j][0]-x[l][0];
+          del34[1] = x[j][1]-x[l][1];
+          del34[2] = x[j][2]-x[l][2];
+          rsq = del34[0]*del34[0] + del34[1]*del34[1] + del34[2]*del34[2];
+          r34 = sqrt(rsq);
+          cos234 = (del32[0]*del34[0] + del32[1]*del34[1] +
+                    del32[2]*del34[2]) / (r32*r34);
+          cos234 = MIN(cos234,1.0);
+          cos234 = MAX(cos234,-1.0);
+          sin234 = sqrt(1.0 - cos234*cos234);
+          if (sin234 < TOL) continue;
+          w34 = Sp(r34,rcmin[jtype][ltype],rcmax[jtype][ltype],dw34);
+          delil[0] = del23[0] + del34[0];
+          delil[1] = del23[1] + del34[1];
+          delil[2] = del23[2] + del34[2];
+          ril2 = delil[0]*delil[0] + delil[1]*delil[1] + delil[2]*delil[2];
+          ril=sqrt(ril2);
+          rjl2 = r34*r34;
+
+          rjl = r34;
+          rjl2 = r34*r34;
+          costmp = 0.5*(rij2+rjl2-ril2)/rij/rjl;
+          tspijl = Sp2(costmp,thmin,thmax,dtsijl);
+          dtsijl = -dtsijl; //need minus sign
+          cross321[0] = (del32[1]*del21[2])-(del32[2]*del21[1]);
+          cross321[1] = (del32[2]*del21[0])-(del32[0]*del21[2]);
+          cross321[2] = (del32[0]*del21[1])-(del32[1]*del21[0]);
+          cross321mag = sqrt(cross321[0]*cross321[0]+
+                             cross321[1]*cross321[1]+
+                             cross321[2]*cross321[2]);
+          cross234[0] = (del23[1]*del34[2])-(del23[2]*del34[1]);
+          cross234[1] = (del23[2]*del34[0])-(del23[0]*del34[2]);
+          cross234[2] = (del23[0]*del34[1])-(del23[1]*del34[0]);
+          cross234mag = sqrt(cross234[0]*cross234[0]+
+                             cross234[1]*cross234[1]+
+                             cross234[2]*cross234[2]);
+          cwnum = (cross321[0]*cross234[0]) +
+            (cross321[1]*cross234[1])+(cross321[2]*cross234[2]);
+          cwnom = r21*r34*r32*r32*sin321*sin234;
+          cw = cwnum/cwnom;
+
+          cw2 = (.5*(1.0-cw));
+          ekijl = epsilonT[ktype][ltype];
+          Ec = 256.0*ekijl/405.0;
+          Vtors = (Ec*(pow(cw2,5.0)))-(ekijl/10.0);
+
+          if (eflag) evdwl = Vtors*w21*w23*w34*(1.0-tspjik)*(1.0-tspijl);
+
+          dndij[0] = (cross234[1]*del21[2])-(cross234[2]*del21[1]);
+          dndij[1] = (cross234[2]*del21[0])-(cross234[0]*del21[2]);
+          dndij[2] = (cross234[0]*del21[1])-(cross234[1]*del21[0]);
+
+          tmpvec[0] = (del34[1]*cross321[2])-(del34[2]*cross321[1]);
+          tmpvec[1] = (del34[2]*cross321[0])-(del34[0]*cross321[2]);
+          tmpvec[2] = (del34[0]*cross321[1])-(del34[1]*cross321[0]);
+
+          dndij[0] = dndij[0]+tmpvec[0];
+          dndij[1] = dndij[1]+tmpvec[1];
+          dndij[2] = dndij[2]+tmpvec[2];
+
+          dndik[0] = (del23[1]*cross234[2])-(del23[2]*cross234[1]);
+          dndik[1] = (del23[2]*cross234[0])-(del23[0]*cross234[2]);
+          dndik[2] = (del23[0]*cross234[1])-(del23[1]*cross234[0]);
+
+          dndjl[0] = (cross321[1]*del23[2])-(cross321[2]*del23[1]);
+          dndjl[1] = (cross321[2]*del23[0])-(cross321[0]*del23[2]);
+          dndjl[2] = (cross321[0]*del23[1])-(cross321[1]*del23[0]);
+
+          dcidij = ((r23*r23)-(r21*r21)+(rjk*rjk))/(2.0*r23*r23*r21);
+          dcidik = ((r21*r21)-(r23*r23)+(rjk*rjk))/(2.0*r23*r21*r21);
+          dcidjk = (-rjk)/(r23*r21);
+          dcjdji = ((r23*r23)-(r34*r34)+(ril*ril))/(2.0*r23*r23*r34);
+          dcjdjl = ((r34*r34)-(r23*r23)+(ril*ril))/(2.0*r23*r34*r34);
+          dcjdil = (-ril)/(r23*r34);
+
+          dsidij = (-cos321/sin321)*dcidij;
+          dsidik = (-cos321/sin321)*dcidik;
+          dsidjk = (-cos321/sin321)*dcidjk;
+
+          dsjdji = (-cos234/sin234)*dcjdji;
+          dsjdjl = (-cos234/sin234)*dcjdjl;
+          dsjdil = (-cos234/sin234)*dcjdil;
+
+          dxidij = (r21*sin321)+(r23*r21*dsidij);
+          dxidik = (r23*sin321)+(r23*r21*dsidik);
+          dxidjk = (r23*r21*dsidjk);
+
+          dxjdji = (r34*sin234)+(r23*r34*dsjdji);
+          dxjdjl = (r23*sin234)+(r23*r34*dsjdjl);
+          dxjdil = (r23*r34*dsjdil);
+
+          ddndij = (dxidij*cross234mag)+(cross321mag*dxjdji);
+          ddndik = dxidik*cross234mag;
+          ddndjk = dxidjk*cross234mag;
+          ddndjl = cross321mag*dxjdjl;
+          ddndil = cross321mag*dxjdil;
+          dcwddn = -cwnum/(cwnom*cwnom);
+          dcwdn = 1.0/cwnom;
+          dvpdcw = (-1.0)*Ec*(-.5)*5.0*pow(cw2,4.0) *
+            w23*w21*w34*(1.0-tspjik)*(1.0-tspijl);
+
+          Ftmp[0] = dvpdcw*((dcwdn*dndij[0])+(dcwddn*ddndij*del23[0]/r23));
+          Ftmp[1] = dvpdcw*((dcwdn*dndij[1])+(dcwddn*ddndij*del23[1]/r23));
+          Ftmp[2] = dvpdcw*((dcwdn*dndij[2])+(dcwddn*ddndij*del23[2]/r23));
+          fi[0] = Ftmp[0];
+          fi[1] = Ftmp[1];
+          fi[2] = Ftmp[2];
+          fj[0] = -Ftmp[0];
+          fj[1] = -Ftmp[1];
+          fj[2] = -Ftmp[2];
+
+          Ftmp[0] = dvpdcw*((dcwdn*dndik[0])+(dcwddn*ddndik*del21[0]/r21));
+          Ftmp[1] = dvpdcw*((dcwdn*dndik[1])+(dcwddn*ddndik*del21[1]/r21));
+          Ftmp[2] = dvpdcw*((dcwdn*dndik[2])+(dcwddn*ddndik*del21[2]/r21));
+          fi[0] += Ftmp[0];
+          fi[1] += Ftmp[1];
+          fi[2] += Ftmp[2];
+          fk[0] = -Ftmp[0];
+          fk[1] = -Ftmp[1];
+          fk[2] = -Ftmp[2];
+
+          Ftmp[0] = (dvpdcw*dcwddn*ddndjk*deljk[0])/rjk;
+          Ftmp[1] = (dvpdcw*dcwddn*ddndjk*deljk[1])/rjk;
+          Ftmp[2] = (dvpdcw*dcwddn*ddndjk*deljk[2])/rjk;
+          fj[0] += Ftmp[0];
+          fj[1] += Ftmp[1];
+          fj[2] += Ftmp[2];
+          fk[0] -= Ftmp[0];
+          fk[1] -= Ftmp[1];
+          fk[2] -= Ftmp[2];
+
+          Ftmp[0] = dvpdcw*((dcwdn*dndjl[0])+(dcwddn*ddndjl*del34[0]/r34));
+          Ftmp[1] = dvpdcw*((dcwdn*dndjl[1])+(dcwddn*ddndjl*del34[1]/r34));
+          Ftmp[2] = dvpdcw*((dcwdn*dndjl[2])+(dcwddn*ddndjl*del34[2]/r34));
+          fj[0] += Ftmp[0];
+          fj[1] += Ftmp[1];
+          fj[2] += Ftmp[2];
+          fl[0] = -Ftmp[0];
+          fl[1] = -Ftmp[1];
+          fl[2] = -Ftmp[2];
+
+          Ftmp[0] = (dvpdcw*dcwddn*ddndil*delil[0])/ril;
+          Ftmp[1] = (dvpdcw*dcwddn*ddndil*delil[1])/ril;
+          Ftmp[2] = (dvpdcw*dcwddn*ddndil*delil[2])/ril;
+          fi[0] += Ftmp[0];
+          fi[1] += Ftmp[1];
+          fi[2] += Ftmp[2];
+          fl[0] -= Ftmp[0];
+          fl[1] -= Ftmp[1];
+          fl[2] -= Ftmp[2];
+
+          // coordination forces
+
+          fpair = Vtors*dw21*w23*w34*(1.0-tspjik)*(1.0-tspijl) / r21;
+          fi[0] -= del21[0]*fpair;
+          fi[1] -= del21[1]*fpair;
+          fi[2] -= del21[2]*fpair;
+          fk[0] += del21[0]*fpair;
+          fk[1] += del21[1]*fpair;
+          fk[2] += del21[2]*fpair;
+
+          fpair = Vtors*w21*dw23*w34*(1.0-tspjik)*(1.0-tspijl) / r23;
+          fi[0] -= del23[0]*fpair;
+          fi[1] -= del23[1]*fpair;
+          fi[2] -= del23[2]*fpair;
+          fj[0] += del23[0]*fpair;
+          fj[1] += del23[1]*fpair;
+          fj[2] += del23[2]*fpair;
+
+          fpair = Vtors*w21*w23*dw34*(1.0-tspjik)*(1.0-tspijl) / r34;
+          fj[0] -= del34[0]*fpair;
+          fj[1] -= del34[1]*fpair;
+          fj[2] -= del34[2]*fpair;
+          fl[0] += del34[0]*fpair;
+          fl[1] += del34[1]*fpair;
+          fl[2] += del34[2]*fpair;
+
+          // additional cut off function forces
+
+          fcpc = -Vtors*w21*w23*w34*dtsjik*(1.0-tspijl);
+          fpair = fcpc*dcidij/rij;
+          fi[0] += fpair*del23[0];
+          fi[1] += fpair*del23[1];
+          fi[2] += fpair*del23[2];
+          fj[0] -= fpair*del23[0];
+          fj[1] -= fpair*del23[1];
+          fj[2] -= fpair*del23[2];
+
+          fpair = fcpc*dcidik/rik;
+          fi[0] += fpair*del21[0];
+          fi[1] += fpair*del21[1];
+          fi[2] += fpair*del21[2];
+          fk[0] -= fpair*del21[0];
+          fk[1] -= fpair*del21[1];
+          fk[2] -= fpair*del21[2];
+
+          fpair = fcpc*dcidjk/rjk;
+          fj[0] += fpair*deljk[0];
+          fj[1] += fpair*deljk[1];
+          fj[2] += fpair*deljk[2];
+          fk[0] -= fpair*deljk[0];
+          fk[1] -= fpair*deljk[1];
+          fk[2] -= fpair*deljk[2];
+
+          fcpc = -Vtors*w21*w23*w34*(1.0-tspjik)*dtsijl;
+          fpair = fcpc*dcjdji/rij;
+          fi[0] += fpair*del23[0];
+          fi[1] += fpair*del23[1];
+          fi[2] += fpair*del23[2];
+          fj[0] -= fpair*del23[0];
+          fj[1] -= fpair*del23[1];
+          fj[2] -= fpair*del23[2];
+
+          fpair = fcpc*dcjdjl/rjl;
+          fj[0] += fpair*del34[0];
+          fj[1] += fpair*del34[1];
+          fj[2] += fpair*del34[2];
+          fl[0] -= fpair*del34[0];
+          fl[1] -= fpair*del34[1];
+          fl[2] -= fpair*del34[2];
+
+          fpair = fcpc*dcjdil/ril;
+          fi[0] += fpair*delil[0];
+          fi[1] += fpair*delil[1];
+          fi[2] += fpair*delil[2];
+          fl[0] -= fpair*delil[0];
+          fl[1] -= fpair*delil[1];
+          fl[2] -= fpair*delil[2];
+
+          // sum per-atom forces into atom force array
+
+          f[i][0] += fi[0]; f[i][1] += fi[1]; f[i][2] += fi[2];
+          f[j][0] += fj[0]; f[j][1] += fj[1]; f[j][2] += fj[2];
+          f[k][0] += fk[0]; f[k][1] += fk[1]; f[k][2] += fk[2];
+          f[l][0] += fl[0]; f[l][1] += fl[1]; f[l][2] += fl[2];
+
+          if (evflag) {
+            delkl[0] = delil[0] - del21[0];
+            delkl[1] = delil[1] - del21[1];
+            delkl[2] = delil[2] - del21[2];
+            ev_tally4_thr(this,i,j,k,l,evdwl,fi,fj,fk,delil,del34,delkl,thr);
+          }
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   create REBO neighbor list from main neighbor list
+   REBO neighbor list stores neighbors of ghost atoms
+------------------------------------------------------------------------- */
+
+void PairAIREBOOMP::REBO_neigh_thr()
+{
+  const int nlocal = atom->nlocal;
+  const int nthreads = comm->nthreads;
+
+  if (atom->nmax > maxlocal) {
+    maxlocal = atom->nmax;
+    memory->destroy(REBO_numneigh);
+    memory->sfree(REBO_firstneigh);
+    memory->destroy(nC);
+    memory->destroy(nH);
+    memory->create(REBO_numneigh,maxlocal,"AIREBO:numneigh");
+    REBO_firstneigh = (int **) memory->smalloc(maxlocal*sizeof(int *),
+                                               "AIREBO:firstneigh");
+    memory->create(nC,maxlocal,"AIREBO:nC");
+    memory->create(nH,maxlocal,"AIREBO:nH");
+  }
+
+  if (nthreads > maxpage)
+    add_pages(nthreads - maxpage);
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    int i,j,ii,jj,n,jnum,itype,jtype;
+    double xtmp,ytmp,ztmp,delx,dely,delz,rsq,dS;
+    int *ilist,*jlist,*numneigh,**firstneigh;
+    int *neighptr;
+
+    double **x = atom->x;
+    int *type = atom->type;
+
+    const int allnum = list->inum + list->gnum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+
+    const int iidelta = 1 + allnum/nthreads;
+    const int iifrom = tid*iidelta;
+    const int iito = ((iifrom+iidelta)>allnum) ? allnum : (iifrom+iidelta);
+
+    // store all REBO neighs of owned and ghost atoms
+    // scan full neighbor list of I
+
+    int npage = tid;
+    int npnt = 0;
+
+    for (ii = iifrom; ii < iito; ii++) {
+      i = ilist[ii];
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+      if (pgsize - npnt < oneatom) {
+        npnt = 0;
+        npage += nthreads;
+        if (npage >= maxpage) add_pages(nthreads);
+      }
+      neighptr = &(pages[npage][npnt]);
+      n = 0;
+
+      xtmp = x[i][0];
+      ytmp = x[i][1];
+      ztmp = x[i][2];
+      itype = map[type[i]];
+      nC[i] = nH[i] = 0.0;
+      jlist = firstneigh[i];
+      jnum = numneigh[i];
+
+      for (jj = 0; jj < jnum; jj++) {
+        j = jlist[jj];
+        j &= NEIGHMASK;
+        jtype = map[type[j]];
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if (rsq < rcmaxsq[itype][jtype]) {
+          neighptr[n++] = j;
+          if (jtype == 0)
+            nC[i] += Sp(sqrt(rsq),rcmin[itype][jtype],rcmax[itype][jtype],dS);
+          else
+            nH[i] += Sp(sqrt(rsq),rcmin[itype][jtype],rcmax[itype][jtype],dS);
+        }
+      }
+
+      REBO_firstneigh[i] = neighptr;
+      REBO_numneigh[i] = n;
+      npnt += n;
+
+      if (npnt >= pgsize)
+        error->one(FLERR,"REBO list overflow, boost neigh_modify one or page");
+    }
+  }
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+double PairAIREBOOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairAIREBO::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_airebo_omp.h b/src/USER-OMP/pair_airebo_omp.h
new file mode 100644
index 0000000000..03bc5e5f8c
--- /dev/null
+++ b/src/USER-OMP/pair_airebo_omp.h
@@ -0,0 +1,53 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(airebo/omp,PairAIREBOOMP)
+
+#else
+
+#ifndef LMP_PAIR_AIREBO_OMP_H
+#define LMP_PAIR_AIREBO_OMP_H
+
+#include "pair_airebo.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairAIREBOOMP : public PairAIREBO, public ThrOMP {
+
+ public:
+  PairAIREBOOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ protected:
+  double bondorder_thr(int i, int j, double rij[3], double rijmag,
+                       double VA, int vflag_atom, ThrData * const thr);
+  double bondorderLJ_thr(int i, int j, double rij[3], double rijmag,
+                         double VA, double rij0[3], double rijmag0,
+                         int vflag_atom, ThrData * const thr);
+
+  void FREBO_thr(int ifrom, int ito, int evflag, int eflag,
+                 int vflag_atom, ThrData * const thr);
+  void FLJ_thr(int ifrom, int ito, int evflag, int eflag,
+               int vflag_atom, ThrData * const thr);
+  void TORSION_thr(int ifrom, int ito, int evflag, int eflag, ThrData * const thr);
+  void REBO_neigh_thr();
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_beck_omp.cpp b/src/USER-OMP/pair_beck_omp.cpp
new file mode 100644
index 0000000000..9bec6c0db2
--- /dev/null
+++ b/src/USER-OMP/pair_beck_omp.cpp
@@ -0,0 +1,173 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_beck_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairBeckOMP::PairBeckOMP(LAMMPS *lmp) :
+  PairBeck(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBeckOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairBeckOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r5,force_beck,factor_lj;
+  double r,rinv;
+  double aaij,alphaij,betaij;
+  double term1,term1inv,term2,term3,term4,term5,term6;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+        r5 = rsq*rsq*r;
+        aaij = aa[itype][jtype];
+        alphaij = alpha[itype][jtype];
+        betaij = beta[itype][jtype];
+        term1 = aaij*aaij + rsq;
+        term2 = 1.0/pow(term1,5.0);
+        term3 = 21.672 + 30.0*aaij*aaij + 6.0*rsq;
+        term4 = alphaij + r5*betaij;
+        term5 = alphaij + 6.0*r5*betaij;
+        rinv  = 1.0/r;
+        force_beck = AA[itype][jtype]*exp(-1.0*r*term4)*term5;
+        force_beck -= BB[itype][jtype]*r*term2*term3;
+
+        fpair = factor_lj*force_beck*rinv;
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          term6 = 1.0/pow(term1,3.0);
+          term1inv = 1.0/term1;
+          evdwl = AA[itype][jtype]*exp(-1.0*r*term4);
+          evdwl -= BB[itype][jtype]*term6*(1.0+(2.709+3.0*aaij*aaij)*term1inv);
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairBeckOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairBeck::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_beck_omp.h b/src/USER-OMP/pair_beck_omp.h
new file mode 100644
index 0000000000..18a4401bbc
--- /dev/null
+++ b/src/USER-OMP/pair_beck_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(beck/omp,PairBeckOMP)
+
+#else
+
+#ifndef LMP_PAIR_BECK_OMP_H
+#define LMP_PAIR_BECK_OMP_H
+
+#include "pair_beck.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairBeckOMP : public PairBeck, public ThrOMP {
+
+ public:
+  PairBeckOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_born_coul_long_omp.cpp b/src/USER-OMP/pair_born_coul_long_omp.cpp
new file mode 100644
index 0000000000..f627f19bac
--- /dev/null
+++ b/src/USER-OMP/pair_born_coul_long_omp.cpp
@@ -0,0 +1,197 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_born_coul_long_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairBornCoulLongOMP::PairBornCoulLongOMP(LAMMPS *lmp) :
+  PairBornCoulLong(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBornCoulLongOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairBornCoulLongOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,r2inv,r6inv,r,rexp,forcecoul,forceborn,factor_coul,factor_lj;
+  double grij,expm2,prefactor,t,erfc;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r = sqrt(rsq);
+
+        if (rsq < cut_coulsq) {
+          grij = g_ewald * r;
+          expm2 = exp(-grij*grij);
+          t = 1.0 / (1.0 + EWALD_P*grij);
+          erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          prefactor = qqrd2e * qtmp*q[j]/r;
+          forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+          if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          rexp = exp((sigma[itype][jtype]-r)*rhoinv[itype][jtype]);
+          forceborn = born1[itype][jtype]*r*rexp - born2[itype][jtype]*r6inv
+            + born3[itype][jtype]*r2inv*r6inv;
+        } else forceborn = 0.0;
+
+        fpair = (forcecoul + factor_lj*forceborn)*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq) {
+            ecoul = prefactor*erfc;
+            if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+          } else ecoul = 0.0;
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv
+              + d[itype][jtype]*r6inv*r2inv - offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairBornCoulLongOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairBornCoulLong::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_born_coul_long_omp.h b/src/USER-OMP/pair_born_coul_long_omp.h
new file mode 100644
index 0000000000..bdf81e514e
--- /dev/null
+++ b/src/USER-OMP/pair_born_coul_long_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(born/coul/long/omp,PairBornCoulLongOMP)
+
+#else
+
+#ifndef LMP_PAIR_BORN_COUL_LONG_OMP_H
+#define LMP_PAIR_BORN_COUL_LONG_OMP_H
+
+#include "pair_born_coul_long.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairBornCoulLongOMP : public PairBornCoulLong, public ThrOMP {
+
+ public:
+  PairBornCoulLongOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_born_coul_wolf_omp.cpp b/src/USER-OMP/pair_born_coul_wolf_omp.cpp
new file mode 100644
index 0000000000..35e0ac7443
--- /dev/null
+++ b/src/USER-OMP/pair_born_coul_wolf_omp.cpp
@@ -0,0 +1,204 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_born_coul_wolf_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "math_const.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+/* ---------------------------------------------------------------------- */
+
+PairBornCoulWolfOMP::PairBornCoulWolfOMP(LAMMPS *lmp) :
+  PairBornCoulWolf(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBornCoulWolfOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairBornCoulWolfOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,r2inv,r6inv,forcecoul,forceborn,factor_coul,factor_lj;
+  double prefactor;
+  double r,rexp;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  double erfcc,erfcd,v_sh,dvdrr,e_self,e_shift,f_shift,qisq;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  // self and shifted coulombic energy
+
+  e_self = v_sh = 0.0;
+  e_shift = erfc(alf*cut_coul)/cut_coul;
+  f_shift = -(e_shift+ 2.0*alf/MY_PIS * exp(-alf*alf*cut_coul*cut_coul)) /
+    cut_coul;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    qisq = qtmp*qtmp;
+    e_self = -(e_shift/2.0 + alf/MY_PIS) * qisq*qqrd2e;
+    if (EVFLAG) ev_tally_thr(this,i,i,nlocal,0,0.0,e_self,0.0,0.0,0.0,0.0,thr);
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r = sqrt(rsq);
+
+        if (rsq < cut_coulsq) {
+          prefactor = qqrd2e*qtmp*q[j]/r;
+          erfcc = erfc(alf*r);
+          erfcd = exp(-alf*alf*r*r);
+          v_sh = (erfcc - e_shift*r) * prefactor;
+          dvdrr = (erfcc/rsq + 2.0*alf/MY_PIS * erfcd/r) + f_shift;
+          forcecoul = dvdrr*rsq*prefactor;
+          if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          rexp = exp((sigma[itype][jtype]-r)*rhoinv[itype][jtype]);
+          forceborn = born1[itype][jtype]*r*rexp - born2[itype][jtype]*r6inv
+            + born3[itype][jtype]*r2inv*r6inv;
+        } else forceborn = 0.0;
+
+        fpair = (forcecoul + factor_lj*forceborn)*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq) {
+            ecoul = v_sh;
+            if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+          } else ecoul = 0.0;
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv +
+              d[itype][jtype]*r6inv*r2inv - offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairBornCoulWolfOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairBornCoulWolf::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_born_coul_wolf_omp.h b/src/USER-OMP/pair_born_coul_wolf_omp.h
new file mode 100644
index 0000000000..d714b32d29
--- /dev/null
+++ b/src/USER-OMP/pair_born_coul_wolf_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(born/coul/wolf/omp,PairBornCoulWolfOMP)
+
+#else
+
+#ifndef LMP_PAIR_BORN_COUL_WOLF_OMP_H
+#define LMP_PAIR_BORN_COUL_WOLF_OMP_H
+
+#include "pair_born_coul_wolf.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairBornCoulWolfOMP : public PairBornCoulWolf, public ThrOMP {
+
+ public:
+  PairBornCoulWolfOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_born_omp.cpp b/src/USER-OMP/pair_born_omp.cpp
new file mode 100644
index 0000000000..847efaa013
--- /dev/null
+++ b/src/USER-OMP/pair_born_omp.cpp
@@ -0,0 +1,161 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_born_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairBornOMP::PairBornOMP(LAMMPS *lmp) :
+  PairBorn(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBornOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairBornOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r6inv,r,rexp,forceborn,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        r = sqrt(rsq);
+        rexp = exp((sigma[itype][jtype]-r)*rhoinv[itype][jtype]);
+        forceborn = born1[itype][jtype]*r*rexp - born2[itype][jtype]*r6inv
+          + born3[itype][jtype]*r2inv*r6inv;
+        fpair = factor_lj*forceborn*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv
+            + d[itype][jtype]*r6inv*r2inv - offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairBornOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairBorn::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_born_omp.h b/src/USER-OMP/pair_born_omp.h
new file mode 100644
index 0000000000..db3c768f7a
--- /dev/null
+++ b/src/USER-OMP/pair_born_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(born/omp,PairBornOMP)
+
+#else
+
+#ifndef LMP_PAIR_BORN_OMP_H
+#define LMP_PAIR_BORN_OMP_H
+
+#include "pair_born.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairBornOMP : public PairBorn, public ThrOMP {
+
+ public:
+  PairBornOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_brownian_omp.cpp b/src/USER-OMP/pair_brownian_omp.cpp
new file mode 100644
index 0000000000..561c82abf5
--- /dev/null
+++ b/src/USER-OMP/pair_brownian_omp.cpp
@@ -0,0 +1,398 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_brownian_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "force.h"
+#include "input.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "update.h"
+#include "variable.h"
+#include "random_mars.h"
+#include "math_const.h"
+
+#include "fix_wall.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define EPSILON 1.0e-10
+
+// same as fix_wall.cpp
+
+enum{EDGE,CONSTANT,VARIABLE};
+
+/* ---------------------------------------------------------------------- */
+
+PairBrownianOMP::PairBrownianOMP(LAMMPS *lmp) :
+  PairBrownian(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  random_thr = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairBrownianOMP::~PairBrownianOMP()
+{
+  if (random_thr) {
+    for (int i=1; i < comm->nthreads; ++i)
+      delete random_thr[i];
+
+    delete[] random_thr;
+    random_thr = NULL;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBrownianOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  // This section of code adjusts R0/RT0/RS0 if necessary due to changes
+  // in the volume fraction as a result of fix deform or moving walls
+
+  double dims[3], wallcoord;
+  if (flagVF) // Flag for volume fraction corrections
+    if (flagdeform || flagwall == 2){ // Possible changes in volume fraction
+      if (flagdeform && !flagwall)
+        for (int j = 0; j < 3; j++)
+          dims[j] = domain->prd[j];
+      else if (flagwall == 2 || (flagdeform && flagwall == 1)){
+        double wallhi[3], walllo[3];
+        for (int j = 0; j < 3; j++){
+          wallhi[j] = domain->prd[j];
+          walllo[j] = 0;
+        }
+        for (int m = 0; m < wallfix->nwall; m++){
+          int dim = wallfix->wallwhich[m] / 2;
+          int side = wallfix->wallwhich[m] % 2;
+          if (wallfix->wallstyle[m] == VARIABLE){
+            wallcoord = input->variable->compute_equal(wallfix->varindex[m]);
+          }
+          else wallcoord = wallfix->coord0[m];
+          if (side == 0) walllo[dim] = wallcoord;
+          else wallhi[dim] = wallcoord;
+        }
+        for (int j = 0; j < 3; j++)
+          dims[j] = wallhi[j] - walllo[j];
+      }
+      double vol_T = dims[0]*dims[1]*dims[2];
+      double vol_f = vol_P/vol_T;
+      if (flaglog == 0) {
+        R0  = 6*MY_PI*mu*rad*(1.0 + 2.16*vol_f);
+        RT0 = 8*MY_PI*mu*pow(rad,3.0);
+        //RS0 = 20.0/3.0*MY_PI*mu*pow(rad,3)*(1.0 + 3.33*vol_f + 2.80*vol_f*vol_f);
+      } else {
+        R0  = 6*MY_PI*mu*rad*(1.0 + 2.725*vol_f - 6.583*vol_f*vol_f);
+        RT0 = 8*MY_PI*mu*pow(rad,3.0)*(1.0 + 0.749*vol_f - 2.469*vol_f*vol_f);
+        //RS0 = 20.0/3.0*MY_PI*mu*pow(rad,3)*(1.0 + 3.64*vol_f - 6.95*vol_f*vol_f);
+      }
+    }
+
+
+  if (!random_thr)
+    random_thr = new RanMars*[nthreads];
+
+  // to ensure full compatibility with the serial Brownian style
+  // we use is random number generator instance for thread 0
+  random_thr[0] = random;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // generate a random number generator instance for
+    // all threads != 0. make sure we use unique seeds.
+    if (random_thr && tid > 0)
+      random_thr[tid] = new RanMars(Pair::lmp, seed + comm->me
+                                    + comm->nprocs*tid);
+    if (flaglog) {
+      if (evflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (evflag) {
+        if (force->newton_pair) eval<0,1,1>(ifrom, ito, thr);
+        else eval<0,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+        else eval<0,0,0>(ifrom, ito, thr);
+      }
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int FLAGLOG, int EVFLAG, int NEWTON_PAIR>
+void PairBrownianOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fx,fy,fz,tx,ty,tz;
+  double rsq,r,h_sep,radi;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  double * const * const torque = thr->get_torque();
+  const double * const radius = atom->radius;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  RanMars &rng = *random_thr[thr->get_tid()];
+
+  double vxmu2f = force->vxmu2f;
+  double randr;
+  double prethermostat;
+  double xl[3],a_sq,a_sh,a_pu,Fbmag;
+  double p1[3],p2[3],p3[3];
+  int overlaps = 0;
+
+  // scale factor for Brownian moments
+
+  prethermostat = sqrt(24.0*force->boltz*t_target/update->dt);
+  prethermostat *= sqrt(force->vxmu2f/force->ftm2v/force->mvv2e);
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    radi = radius[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    // FLD contribution to force and torque due to isotropic terms
+
+    if (flagfld) {
+      f[i][0] += prethermostat*sqrt(R0)*(rng.uniform()-0.5);
+      f[i][1] += prethermostat*sqrt(R0)*(rng.uniform()-0.5);
+      f[i][2] += prethermostat*sqrt(R0)*(rng.uniform()-0.5);
+      if (FLAGLOG) {
+        torque[i][0] += prethermostat*sqrt(RT0)*(rng.uniform()-0.5);
+        torque[i][1] += prethermostat*sqrt(RT0)*(rng.uniform()-0.5);
+        torque[i][2] += prethermostat*sqrt(RT0)*(rng.uniform()-0.5);
+      }
+    }
+
+    if (!flagHI) continue;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+
+        // scalar resistances a_sq and a_sh
+
+        h_sep = r - 2.0*radi;
+
+        // check for overlaps
+
+        if (h_sep < 0.0) overlaps++;
+
+        // if less than minimum gap, use minimum gap instead
+
+        if (r < cut_inner[itype][jtype])
+          h_sep = cut_inner[itype][jtype] - 2.0*radi;
+
+        // scale h_sep by radi
+
+        h_sep = h_sep/radi;
+
+        // scalar resistances
+
+        if (FLAGLOG) {
+          a_sq = 6.0*MY_PI*mu*radi*(1.0/4.0/h_sep + 9.0/40.0*log(1.0/h_sep));
+          a_sh = 6.0*MY_PI*mu*radi*(1.0/6.0*log(1.0/h_sep));
+          a_pu = 8.0*MY_PI*mu*pow(radi,3.0)*(3.0/160.0*log(1.0/h_sep));
+        } else
+          a_sq = 6.0*MY_PI*mu*radi*(1.0/4.0/h_sep);
+
+        // generate the Pairwise Brownian Force: a_sq
+
+        Fbmag = prethermostat*sqrt(a_sq);
+
+        // generate a random number
+
+        randr = rng.uniform()-0.5;
+
+        // contribution due to Brownian motion
+
+        fx = Fbmag*randr*delx/r;
+        fy = Fbmag*randr*dely/r;
+        fz = Fbmag*randr*delz/r;
+
+        // add terms due to a_sh
+
+        if (FLAGLOG) {
+
+          // generate two orthogonal vectors to the line of centers
+
+          p1[0] = delx/r; p1[1] = dely/r; p1[2] = delz/r;
+          set_3_orthogonal_vectors(p1,p2,p3);
+
+          // magnitude
+
+          Fbmag = prethermostat*sqrt(a_sh);
+
+          // force in each of the two directions
+
+          randr = rng.uniform()-0.5;
+          fx += Fbmag*randr*p2[0];
+          fy += Fbmag*randr*p2[1];
+          fz += Fbmag*randr*p2[2];
+
+          randr = rng.uniform()-0.5;
+          fx += Fbmag*randr*p3[0];
+          fy += Fbmag*randr*p3[1];
+          fz += Fbmag*randr*p3[2];
+        }
+
+        // scale forces to appropriate units
+
+        fx = vxmu2f*fx;
+        fy = vxmu2f*fy;
+        fz = vxmu2f*fz;
+
+        // sum to total force
+
+        f[i][0] -= fx;
+        f[i][1] -= fy;
+        f[i][2] -= fz;
+
+        if (NEWTON_PAIR || j < nlocal) {
+          //randr = rng.uniform()-0.5;
+          //fx = Fbmag*randr*delx/r;
+          //fy = Fbmag*randr*dely/r;
+          //fz = Fbmag*randr*delz/r;
+
+          f[j][0] += fx;
+          f[j][1] += fy;
+          f[j][2] += fz;
+        }
+
+        // torque due to the Brownian Force
+
+        if (FLAGLOG) {
+
+          // location of the point of closest approach on I from its center
+
+          xl[0] = -delx/r*radi;
+          xl[1] = -dely/r*radi;
+          xl[2] = -delz/r*radi;
+
+          // torque = xl_cross_F
+
+          tx = xl[1]*fz - xl[2]*fy;
+          ty = xl[2]*fx - xl[0]*fz;
+          tz = xl[0]*fy - xl[1]*fx;
+
+          // torque is same on both particles
+
+          torque[i][0] -= tx;
+          torque[i][1] -= ty;
+          torque[i][2] -= tz;
+
+          if (NEWTON_PAIR || j < nlocal) {
+            torque[j][0] -= tx;
+            torque[j][1] -= ty;
+            torque[j][2] -= tz;
+          }
+
+          // torque due to a_pu
+
+          Fbmag = prethermostat*sqrt(a_pu);
+
+          // force in each direction
+
+          randr = rng.uniform()-0.5;
+          tx = Fbmag*randr*p2[0];
+          ty = Fbmag*randr*p2[1];
+          tz = Fbmag*randr*p2[2];
+
+          randr = rng.uniform()-0.5;
+          tx += Fbmag*randr*p3[0];
+          ty += Fbmag*randr*p3[1];
+          tz += Fbmag*randr*p3[2];
+
+          // torque has opposite sign on two particles
+
+          torque[i][0] -= tx;
+          torque[i][1] -= ty;
+          torque[i][2] -= tz;
+
+          if (NEWTON_PAIR || j < nlocal) {
+            torque[j][0] += tx;
+            torque[j][1] += ty;
+            torque[j][2] += tz;
+          }
+        }
+
+        if (EVFLAG) ev_tally_xyz(i,j,nlocal,NEWTON_PAIR,
+                                 0.0,0.0,-fx,-fy,-fz,delx,dely,delz);
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairBrownianOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairBrownian::memory_usage();
+  bytes += comm->nthreads * sizeof(RanMars*);
+  bytes += comm->nthreads * sizeof(RanMars);
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_brownian_omp.h b/src/USER-OMP/pair_brownian_omp.h
new file mode 100644
index 0000000000..cdabe08ba1
--- /dev/null
+++ b/src/USER-OMP/pair_brownian_omp.h
@@ -0,0 +1,52 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(brownian/omp,PairBrownianOMP)
+
+#else
+
+#ifndef LMP_PAIR_BROWNIAN_OMP_H
+#define LMP_PAIR_BROWNIAN_OMP_H
+
+#include "pair_brownian.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairBrownianOMP : public PairBrownian, public ThrOMP {
+
+ public:
+  PairBrownianOMP(class LAMMPS *);
+  virtual ~PairBrownianOMP();
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ protected:
+  class RanMars **random_thr;
+
+ private:
+  template <int LOGFLAG, int EVFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_brownian_poly_omp.cpp b/src/USER-OMP/pair_brownian_poly_omp.cpp
new file mode 100644
index 0000000000..6affbf96cf
--- /dev/null
+++ b/src/USER-OMP/pair_brownian_poly_omp.cpp
@@ -0,0 +1,387 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_brownian_poly_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "force.h"
+#include "input.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "update.h"
+#include "variable.h"
+#include "random_mars.h"
+#include "fix_wall.h"
+
+#include "math_const.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define EPSILON 1.0e-10
+
+// same as fix_wall.cpp
+
+enum{EDGE,CONSTANT,VARIABLE};
+
+/* ---------------------------------------------------------------------- */
+
+PairBrownianPolyOMP::PairBrownianPolyOMP(LAMMPS *lmp) :
+  PairBrownianPoly(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  random_thr = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairBrownianPolyOMP::~PairBrownianPolyOMP()
+{
+  if (random_thr) {
+    for (int i=1; i < comm->nthreads; ++i)
+      delete random_thr[i];
+
+    delete[] random_thr;
+    random_thr = NULL;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBrownianPolyOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  // This section of code adjusts R0/RT0/RS0 if necessary due to changes
+  // in the volume fraction as a result of fix deform or moving walls
+
+  double dims[3], wallcoord;
+  if (flagVF) // Flag for volume fraction corrections
+    if (flagdeform || flagwall == 2){ // Possible changes in volume fraction
+      if (flagdeform && !flagwall)
+        for (int j = 0; j < 3; j++)
+          dims[j] = domain->prd[j];
+      else if (flagwall == 2 || (flagdeform && flagwall == 1)){
+        double wallhi[3], walllo[3];
+        for (int j = 0; j < 3; j++){
+          wallhi[j] = domain->prd[j];
+          walllo[j] = 0;
+        }
+        for (int m = 0; m < wallfix->nwall; m++){
+          int dim = wallfix->wallwhich[m] / 2;
+          int side = wallfix->wallwhich[m] % 2;
+          if (wallfix->wallstyle[m] == VARIABLE){
+            wallcoord = input->variable->compute_equal(wallfix->varindex[m]);
+          }
+          else wallcoord = wallfix->coord0[m];
+          if (side == 0) walllo[dim] = wallcoord;
+          else wallhi[dim] = wallcoord;
+        }
+        for (int j = 0; j < 3; j++)
+          dims[j] = wallhi[j] - walllo[j];
+      }
+      double vol_T = dims[0]*dims[1]*dims[2];
+      double vol_f = vol_P/vol_T;
+      if (flaglog == 0) {
+        R0  = 6*MY_PI*mu*rad*(1.0 + 2.16*vol_f);
+        RT0 = 8*MY_PI*mu*pow(rad,3.0);
+        //RS0 = 20.0/3.0*MY_PI*mu*pow(rad,3)*(1.0 + 3.33*vol_f + 2.80*vol_f*vol_f);
+      } else {
+        R0  = 6*MY_PI*mu*rad*(1.0 + 2.725*vol_f - 6.583*vol_f*vol_f);
+        RT0 = 8*MY_PI*mu*pow(rad,3.0)*(1.0 + 0.749*vol_f - 2.469*vol_f*vol_f);
+        //RS0 = 20.0/3.0*MY_PI*mu*pow(rad,3)*(1.0 + 3.64*vol_f - 6.95*vol_f*vol_f);
+      }
+    }
+
+
+  if (!random_thr)
+    random_thr = new RanMars*[nthreads];
+
+  // to ensure full compatibility with the serial BrownianPoly style
+  // we use is random number generator instance for thread 0
+  random_thr[0] = random;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // generate a random number generator instance for
+    // all threads != 0. make sure we use unique seeds.
+    if (random_thr && tid > 0)
+      random_thr[tid] = new RanMars(Pair::lmp, seed + comm->me
+                                    + comm->nprocs*tid);
+    if (flaglog) {
+      if (evflag)
+        eval<1,1>(ifrom, ito, thr);
+      else
+        eval<1,0>(ifrom, ito, thr);
+    } else {
+      if (evflag)
+        eval<0,1>(ifrom, ito, thr);
+      else eval<0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int FLAGLOG, int EVFLAG>
+void PairBrownianPolyOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fx,fy,fz,tx,ty,tz;
+  double rsq,r,h_sep,beta0,beta1,radi,radj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  double * const * const torque = thr->get_torque();
+  const double * const radius = atom->radius;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  RanMars &rng = *random_thr[thr->get_tid()];
+
+  double vxmu2f = force->vxmu2f;
+  int overlaps = 0;
+  double randr;
+  double prethermostat;
+  double xl[3],a_sq,a_sh,a_pu,Fbmag;
+  double p1[3],p2[3],p3[3];
+
+  // scale factor for Brownian moments
+
+  prethermostat = sqrt(24.0*force->boltz*t_target/update->dt);
+  prethermostat *= sqrt(force->vxmu2f/force->ftm2v/force->mvv2e);
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    radi = radius[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    // FLD contribution to force and torque due to isotropic terms
+
+    if (flagfld) {
+      f[i][0] += prethermostat*sqrt(R0*radi)*(rng.uniform()-0.5);
+      f[i][1] += prethermostat*sqrt(R0*radi)*(rng.uniform()-0.5);
+      f[i][2] += prethermostat*sqrt(R0*radi)*(rng.uniform()-0.5);
+      if (FLAGLOG) {
+        const double rad3 = radi*radi*radi;
+        torque[i][0] += prethermostat*sqrt(RT0*rad3)*(rng.uniform()-0.5);
+        torque[i][1] += prethermostat*sqrt(RT0*rad3)*(rng.uniform()-0.5);
+        torque[i][2] += prethermostat*sqrt(RT0*rad3)*(rng.uniform()-0.5);
+      }
+    }
+
+    if (!flagHI) continue;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+      radj = radius[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+
+        // scalar resistances a_sq and a_sh
+
+        h_sep = r - radi-radj;
+
+        // check for overlaps
+
+        if (h_sep < 0.0) overlaps++;
+
+        // if less than minimum gap, use minimum gap instead
+
+        if (r < cut_inner[itype][jtype])
+          h_sep = cut_inner[itype][jtype] - radi-radj;
+
+        // scale h_sep by radi
+
+        h_sep = h_sep/radi;
+        beta0 = radj/radi;
+        beta1 = 1.0 + beta0;
+
+        // scalar resistances
+
+        if (FLAGLOG) {
+          a_sq = beta0*beta0/beta1/beta1/h_sep +
+            (1.0+7.0*beta0+beta0*beta0)/5.0/pow(beta1,3.0)*log(1.0/h_sep);
+          a_sq += (1.0+18.0*beta0-29.0*beta0*beta0+18.0*pow(beta0,3.0) +
+                   pow(beta0,4.0))/21.0/pow(beta1,4.0)*h_sep*log(1.0/h_sep);
+          a_sq *= 6.0*MY_PI*mu*radi;
+          a_sh = 4.0*beta0*(2.0+beta0+2.0*beta0*beta0)/15.0/pow(beta1,3.0) *
+            log(1.0/h_sep);
+          a_sh += 4.0*(16.0-45.0*beta0+58.0*beta0*beta0-45.0*pow(beta0,3.0) +
+                       16.0*pow(beta0,4.0))/375.0/pow(beta1,4.0) *
+            h_sep*log(1.0/h_sep);
+          a_sh *= 6.0*MY_PI*mu*radi;
+          a_pu = beta0*(4.0+beta0)/10.0/beta1/beta1*log(1.0/h_sep);
+          a_pu += (32.0-33.0*beta0+83.0*beta0*beta0+43.0 *
+                   pow(beta0,3.0))/250.0/pow(beta1,3.0)*h_sep*log(1.0/h_sep);
+          a_pu *= 8.0*MY_PI*mu*pow(radi,3.0);
+
+        } else a_sq = 6.0*MY_PI*mu*radi*(beta0*beta0/beta1/beta1/h_sep);
+
+        // generate the Pairwise Brownian Force: a_sq
+
+        Fbmag = prethermostat*sqrt(a_sq);
+
+        // generate a random number
+
+        randr = rng.uniform()-0.5;
+
+        // contribution due to Brownian motion
+
+        fx = Fbmag*randr*delx/r;
+        fy = Fbmag*randr*dely/r;
+        fz = Fbmag*randr*delz/r;
+
+        // add terms due to a_sh
+
+        if (FLAGLOG) {
+
+          // generate two orthogonal vectors to the line of centers
+
+          p1[0] = delx/r; p1[1] = dely/r; p1[2] = delz/r;
+          set_3_orthogonal_vectors(p1,p2,p3);
+
+          // magnitude
+
+          Fbmag = prethermostat*sqrt(a_sh);
+
+          // force in each of the two directions
+
+          randr = rng.uniform()-0.5;
+          fx += Fbmag*randr*p2[0];
+          fy += Fbmag*randr*p2[1];
+          fz += Fbmag*randr*p2[2];
+
+          randr = rng.uniform()-0.5;
+          fx += Fbmag*randr*p3[0];
+          fy += Fbmag*randr*p3[1];
+          fz += Fbmag*randr*p3[2];
+        }
+
+        // scale forces to appropriate units
+
+        fx = vxmu2f*fx;
+        fy = vxmu2f*fy;
+        fz = vxmu2f*fz;
+
+        // sum to total force
+
+        f[i][0] -= fx;
+        f[i][1] -= fy;
+        f[i][2] -= fz;
+
+        // torque due to the Brownian Force
+
+        if (FLAGLOG) {
+
+          // location of the point of closest approach on I from its center
+
+          xl[0] = -delx/r*radi;
+          xl[1] = -dely/r*radi;
+          xl[2] = -delz/r*radi;
+
+          // torque = xl_cross_F
+
+          tx = xl[1]*fz - xl[2]*fy;
+          ty = xl[2]*fx - xl[0]*fz;
+          tz = xl[0]*fy - xl[1]*fx;
+
+          // torque is same on both particles
+
+          torque[i][0] -= tx;
+          torque[i][1] -= ty;
+          torque[i][2] -= tz;
+
+          // torque due to a_pu
+
+          Fbmag = prethermostat*sqrt(a_pu);
+
+          // force in each direction
+
+          randr = rng.uniform()-0.5;
+          tx = Fbmag*randr*p2[0];
+          ty = Fbmag*randr*p2[1];
+          tz = Fbmag*randr*p2[2];
+
+          randr = rng.uniform()-0.5;
+          tx += Fbmag*randr*p3[0];
+          ty += Fbmag*randr*p3[1];
+          tz += Fbmag*randr*p3[2];
+
+          // torque has opposite sign on two particles
+
+          torque[i][0] -= tx;
+          torque[i][1] -= ty;
+          torque[i][2] -= tz;
+
+        }
+
+        // set j = nlocal so that only I gets tallied
+
+        if (EVFLAG) ev_tally_xyz(i,nlocal,nlocal,/* newton_pair */ 0,
+                                 0.0,0.0,-fx,-fy,-fz,delx,dely,delz);
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairBrownianPolyOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairBrownianPoly::memory_usage();
+  bytes += comm->nthreads * sizeof(RanMars*);
+  bytes += comm->nthreads * sizeof(RanMars);
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_brownian_poly_omp.h b/src/USER-OMP/pair_brownian_poly_omp.h
new file mode 100644
index 0000000000..663696c6c6
--- /dev/null
+++ b/src/USER-OMP/pair_brownian_poly_omp.h
@@ -0,0 +1,52 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(brownian/poly/omp,PairBrownianPolyOMP)
+
+#else
+
+#ifndef LMP_PAIR_BROWNIAN_POLY_OMP_H
+#define LMP_PAIR_BROWNIAN_POLY_OMP_H
+
+#include "pair_brownian_poly.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairBrownianPolyOMP : public PairBrownianPoly, public ThrOMP {
+
+ public:
+  PairBrownianPolyOMP(class LAMMPS *);
+  virtual ~PairBrownianPolyOMP();
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ protected:
+  class RanMars **random_thr;
+
+ private:
+  template <int LOGFLAG, int EVFLAG>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_buck_coul_cut_omp.cpp b/src/USER-OMP/pair_buck_coul_cut_omp.cpp
new file mode 100644
index 0000000000..8b570ff1a0
--- /dev/null
+++ b/src/USER-OMP/pair_buck_coul_cut_omp.cpp
@@ -0,0 +1,179 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_buck_coul_cut_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairBuckCoulCutOMP::PairBuckCoulCutOMP(LAMMPS *lmp) :
+  PairBuckCoulCut(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckCoulCutOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairBuckCoulCutOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,r2inv,r6inv,r,rexp,forcecoul,forcebuck,factor_coul,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r = sqrt(rsq);
+
+        if (rsq < cut_coulsq[itype][jtype])
+          forcecoul = qqrd2e * qtmp*q[j]/r;
+        else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          rexp = exp(-r*rhoinv[itype][jtype]);
+          forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv;
+        } else forcebuck = 0.0;
+
+        fpair = (forcecoul + factor_lj*forcebuck)*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq[itype][jtype])
+            ecoul = factor_coul * qqrd2e * qtmp*q[j]/r;
+          else ecoul = 0.0;
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv -
+              offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairBuckCoulCutOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairBuckCoulCut::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_buck_coul_cut_omp.h b/src/USER-OMP/pair_buck_coul_cut_omp.h
new file mode 100644
index 0000000000..cf142b022a
--- /dev/null
+++ b/src/USER-OMP/pair_buck_coul_cut_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/coul/cut/omp,PairBuckCoulCutOMP)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_COUL_CUT_OMP_H
+#define LMP_PAIR_BUCK_COUL_CUT_OMP_H
+
+#include "pair_buck_coul_cut.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairBuckCoulCutOMP : public PairBuckCoulCut, public ThrOMP {
+
+ public:
+  PairBuckCoulCutOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_buck_coul_long_omp.cpp b/src/USER-OMP/pair_buck_coul_long_omp.cpp
new file mode 100644
index 0000000000..85f2b48db9
--- /dev/null
+++ b/src/USER-OMP/pair_buck_coul_long_omp.cpp
@@ -0,0 +1,197 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_buck_coul_long_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairBuckCoulLongOMP::PairBuckCoulLongOMP(LAMMPS *lmp) :
+  PairBuckCoulLong(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckCoulLongOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    // reduce per thread forces into global force array.
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairBuckCoulLongOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,r2inv,r6inv,r,rexp,forcecoul,forcebuck,factor_coul,factor_lj;
+  double grij,expm2,prefactor,t,erfc;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r = sqrt(rsq);
+
+        if (rsq < cut_coulsq) {
+          grij = g_ewald * r;
+          expm2 = exp(-grij*grij);
+          t = 1.0 / (1.0 + EWALD_P*grij);
+          erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          prefactor = qqrd2e * qtmp*q[j]/r;
+          forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+          if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          rexp = exp(-r*rhoinv[itype][jtype]);
+          forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv;
+        } else forcebuck = 0.0;
+
+        fpair = (forcecoul + factor_lj*forcebuck)*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq) {
+            ecoul = prefactor*erfc;
+            if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+          } else ecoul = 0.0;
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv -
+              offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairBuckCoulLongOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairBuckCoulLong::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_buck_coul_long_omp.h b/src/USER-OMP/pair_buck_coul_long_omp.h
new file mode 100644
index 0000000000..1309bc4c4b
--- /dev/null
+++ b/src/USER-OMP/pair_buck_coul_long_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/coul/long/omp,PairBuckCoulLongOMP)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_COUL_LONG_OMP_H
+#define LMP_PAIR_BUCK_COUL_LONG_OMP_H
+
+#include "pair_buck_coul_long.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairBuckCoulLongOMP : public PairBuckCoulLong, public ThrOMP {
+
+ public:
+  PairBuckCoulLongOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_buck_coul_omp.cpp b/src/USER-OMP/pair_buck_coul_omp.cpp
new file mode 100644
index 0000000000..d1e54f1b11
--- /dev/null
+++ b/src/USER-OMP/pair_buck_coul_omp.cpp
@@ -0,0 +1,229 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_buck_coul_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "math_vector.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairBuckCoulOMP::PairBuckCoulOMP(LAMMPS *lmp) :
+  PairBuckCoul(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  cut_respa = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckCoulOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairBuckCoulOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  double evdwl,ecoul,fpair;
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_coul = force->special_coul;
+  double *special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+
+  const double *x0 = x[0];
+  double *f0 = f[0], *fi = f0;
+
+  int *ilist = list->ilist;
+
+  // loop over neighbors of my atoms
+
+  int i, ii, j, order1 = ewald_order&(1<<1), order6 = ewald_order&(1<<6);
+  int *jneigh, *jneighn, typei, typej, ni;
+  double qi, qri, *cutsqi, *cut_bucksqi,
+         *buck1i, *buck2i, *buckai, *buckci, *rhoinvi, *offseti;
+  double r, rsq, r2inv, force_coul, force_buck;
+  double g2 = g_ewald*g_ewald, g6 = g2*g2*g2, g8 = g6*g2;
+  vector xi, d;
+
+  for (ii = iifrom; ii < iito; ++ii) {                        // loop over my atoms
+    i = ilist[ii]; fi = f0+3*i;
+    if (order1) qri = (qi = q[i])*qqrd2e;                // initialize constants
+    offseti = offset[typei = type[i]];
+    buck1i = buck1[typei]; buck2i = buck2[typei];
+    buckai = buck_a[typei]; buckci = buck_c[typei], rhoinvi = rhoinv[typei];
+    cutsqi = cutsq[typei]; cut_bucksqi = cut_bucksq[typei];
+    memcpy(xi, x0+(i+(i<<1)), sizeof(vector));
+    jneighn = (jneigh = list->firstneigh[i])+list->numneigh[i];
+
+    for (; jneigh<jneighn; ++jneigh) {                        // loop over neighbors
+      j = *jneigh;
+      ni = sbmask(j);
+      j &= NEIGHMASK;
+
+      { const register double *xj = x0+(j+(j<<1));
+        d[0] = xi[0] - xj[0];                                // pair vector
+        d[1] = xi[1] - xj[1];
+        d[2] = xi[2] - xj[2]; }
+
+      if ((rsq = vec_dot(d, d)) >= cutsqi[typej = type[j]]) continue;
+      r2inv = 1.0/rsq;
+      r = sqrt(rsq);
+
+      if (order1 && (rsq < cut_coulsq)) {                // coulombic
+        if (!ncoultablebits || rsq <= tabinnersq) {        // series real space
+          register double x = g_ewald*r;
+          register double s = qri*q[j], t = 1.0/(1.0+EWALD_P*x);
+          if (ni == 0) {
+            s *= g_ewald*exp(-x*x);
+            force_coul = (t *= ((((t*A5+A4)*t+A3)*t+A2)*t+A1)*s/x)+EWALD_F*s;
+            if (EFLAG) ecoul = t;
+          } else {                                        // special case
+            register double f = s*(1.0-special_coul[ni])/r;
+            s *= g_ewald*exp(-x*x);
+            force_coul = (t *= ((((t*A5+A4)*t+A3)*t+A2)*t+A1)*s/x)+EWALD_F*s-f;
+            if (EFLAG) ecoul = t-f;
+          }                                        // table real space
+        } else {
+          register union_int_float_t t;
+          t.f = rsq;
+          register const int k = (t.i & ncoulmask) >> ncoulshiftbits;
+          register double f = (rsq-rtable[k])*drtable[k], qiqj = qi*q[j];
+          if (ni == 0) {
+            force_coul = qiqj*(ftable[k]+f*dftable[k]);
+            if (EFLAG) ecoul = qiqj*(etable[k]+f*detable[k]);
+          }
+          else {                                        // special case
+            t.f = (1.0-special_coul[ni])*(ctable[k]+f*dctable[k]);
+            force_coul = qiqj*(ftable[k]+f*dftable[k]-t.f);
+            if (EFLAG) ecoul = qiqj*(etable[k]+f*detable[k]-t.f);
+          }
+        }
+      } else force_coul = ecoul = 0.0;
+
+      if (rsq < cut_bucksqi[typej]) {                        // buckingham
+        register double rn = r2inv*r2inv*r2inv,
+                        expr = exp(-r*rhoinvi[typej]);
+        if (order6) {                                        // long-range
+          register double x2 = g2*rsq, a2 = 1.0/x2;
+          x2 = a2*exp(-x2)*buckci[typej];
+          if (ni == 0) {
+            force_buck =
+              r*expr*buck1i[typej]-g8*(((6.0*a2+6.0)*a2+3.0)*a2+1.0)*x2*rsq;
+            if (EFLAG) evdwl = expr*buckai[typej]-g6*((a2+1.0)*a2+0.5)*x2;
+          } else {                                        // special case
+            register double f = special_lj[ni], t = rn*(1.0-f);
+            force_buck = f*r*expr*buck1i[typej]-
+              g8*(((6.0*a2+6.0)*a2+3.0)*a2+1.0)*x2*rsq+t*buck2i[typej];
+            if (EFLAG) evdwl = f*expr*buckai[typej] -
+                         g6*((a2+1.0)*a2+0.5)*x2+t*buckci[typej];
+          }
+        } else {                                                // cut
+          if (ni == 0) {
+            force_buck = r*expr*buck1i[typej]-rn*buck2i[typej];
+            if (EFLAG) evdwl = expr*buckai[typej] -
+                         rn*buckci[typej]-offseti[typej];
+          } else {                                        // special case
+            register double f = special_lj[ni];
+            force_buck = f*(r*expr*buck1i[typej]-rn*buck2i[typej]);
+            if (EFLAG)
+              evdwl = f*(expr*buckai[typej]-rn*buckci[typej]-offseti[typej]);
+          }
+        }
+      } else force_buck = evdwl = 0.0;
+
+      fpair = (force_coul+force_buck)*r2inv;
+
+      if (NEWTON_PAIR || j < nlocal) {
+        register double *fj = f0+(j+(j<<1)), f;
+        fi[0] += f = d[0]*fpair; fj[0] -= f;
+        fi[1] += f = d[1]*fpair; fj[1] -= f;
+        fi[2] += f = d[2]*fpair; fj[2] -= f;
+      } else {
+        fi[0] += d[0]*fpair;
+        fi[1] += d[1]*fpair;
+        fi[2] += d[2]*fpair;
+      }
+
+      if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                               evdwl,ecoul,fpair,d[0],d[1],d[2],thr);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairBuckCoulOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairBuckCoul::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_buck_coul_omp.h b/src/USER-OMP/pair_buck_coul_omp.h
new file mode 100644
index 0000000000..9fc11818a6
--- /dev/null
+++ b/src/USER-OMP/pair_buck_coul_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/coul/omp,PairBuckCoulOMP)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_COUL_OMP_H
+#define LMP_PAIR_BUCK_COUL_OMP_H
+
+#include "pair_buck_coul.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairBuckCoulOMP : public PairBuckCoul, public ThrOMP {
+
+ public:
+  PairBuckCoulOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_buck_omp.cpp b/src/USER-OMP/pair_buck_omp.cpp
new file mode 100644
index 0000000000..8b32e2cfa5
--- /dev/null
+++ b/src/USER-OMP/pair_buck_omp.cpp
@@ -0,0 +1,163 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_buck_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairBuckOMP::PairBuckOMP(LAMMPS *lmp) :
+  PairBuck(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairBuckOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r6inv,r,rexp,forcebuck,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        r = sqrt(rsq);
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        r = sqrt(rsq);
+        rexp = exp(-r*rhoinv[itype][jtype]);
+        forcebuck = buck1[itype][jtype]*r*rexp - buck2[itype][jtype]*r6inv;
+        fpair = factor_lj*forcebuck*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = a[itype][jtype]*rexp - c[itype][jtype]*r6inv -
+            offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairBuckOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairBuck::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_buck_omp.h b/src/USER-OMP/pair_buck_omp.h
new file mode 100644
index 0000000000..72e1178803
--- /dev/null
+++ b/src/USER-OMP/pair_buck_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/omp,PairBuckOMP)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_OMP_H
+#define LMP_PAIR_BUCK_OMP_H
+
+#include "pair_buck.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairBuckOMP : public PairBuck, public ThrOMP {
+
+ public:
+  PairBuckOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_cdeam_omp.cpp b/src/USER-OMP/pair_cdeam_omp.cpp
new file mode 100644
index 0000000000..bec3ce4b25
--- /dev/null
+++ b/src/USER-OMP/pair_cdeam_omp.cpp
@@ -0,0 +1,531 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "string.h"
+
+#include "pair_cdeam_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+// This is for debugging purposes. The ASSERT() macro is used in the code to check
+// if everything runs as expected. Change this to #if 0 if you don't need the checking.
+#if 0
+        #define ASSERT(cond) ((!(cond)) ? my_failure(error,__FILE__,__LINE__) : my_noop())
+
+        inline void my_noop() {}
+        inline void my_failure(Error* error, const char* file, int line) {
+                char str[1024];
+                sprintf(str,"Assertion failure: File %s, line %i", file, line);
+                error->one(FLERR,str);
+        }
+#else
+        #define ASSERT(cond)
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+PairCDEAMOMP::PairCDEAMOMP(LAMMPS *lmp, int _cdeamVersion) :
+  PairEAM(lmp), PairCDEAM(lmp,_cdeamVersion), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCDEAMOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = eflag_global = eflag_atom = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  // grow energy and fp arrays if necessary
+  // need to be atom->nmax in length
+
+  if (atom->nmax > nmax) {
+    memory->destroy(rho);
+    memory->destroy(rhoB);
+    memory->destroy(D_values);
+    memory->destroy(fp);
+    nmax = atom->nmax;
+    memory->create(rho,nthreads*nmax,"pair:rho");
+    memory->create(rhoB,nthreads*nmax,"pair:mu");
+    memory->create(D_values,nthreads*nmax,"pair:D_values");
+    memory->create(fp,nmax,"pair:fp");
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (force->newton_pair)
+      thr->init_cdeam(nall, rho, rhoB, D_values);
+    else
+      thr->init_cdeam(atom->nlocal, rho, rhoB, D_values);
+
+    switch (cdeamVersion) {
+
+    case 1:
+
+      if (evflag) {
+        if (eflag) {
+          if (force->newton_pair) eval<1,1,1,1>(ifrom, ito, thr);
+          else eval<1,1,0,1>(ifrom, ito, thr);
+        } else {
+          if (force->newton_pair) eval<1,0,1,1>(ifrom, ito, thr);
+          else eval<1,0,0,1>(ifrom, ito, thr);
+        }
+      } else {
+        if (force->newton_pair) eval<0,0,1,1>(ifrom, ito, thr);
+        else eval<0,0,0,1>(ifrom, ito, thr);
+      }
+      break;
+
+    case 2:
+
+      if (evflag) {
+        if (eflag) {
+          if (force->newton_pair) eval<1,1,1,2>(ifrom, ito, thr);
+          else eval<1,1,0,2>(ifrom, ito, thr);
+        } else {
+          if (force->newton_pair) eval<1,0,1,2>(ifrom, ito, thr);
+          else eval<1,0,0,2>(ifrom, ito, thr);
+        }
+      } else {
+        if (force->newton_pair) eval<0,0,1,2>(ifrom, ito, thr);
+        else eval<0,0,0,2>(ifrom, ito, thr);
+      }
+      break;
+
+    default:
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+    error->all(FLERR,"unsupported eam/cd pair style variant");
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR, int CDEAMVERSION>
+void PairCDEAMOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,rhoip,rhojp,recip,phi;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  double * const rho_t = thr->get_rho();
+  double * const rhoB_t = thr->get_rhoB();
+  double * const D_values_t = thr->get_D_values();
+  const int tid = thr->get_tid();
+  const int nthreads = comm->nthreads;
+
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // Stage I
+
+  // Compute rho and rhoB at each local atom site.
+  // Additionally calculate the D_i values here if we are using the one-site formulation.
+  // For the two-site formulation we have to calculate the D values in an extra loop (Stage II).
+
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if(rsq < cutforcesq) {
+        jtype = type[j];
+        double r = sqrt(rsq);
+        const EAMTableIndex index = radiusToTableIndex(r);
+        double localrho = RhoOfR(index, jtype, itype);
+        rho_t[i] += localrho;
+        if(jtype == speciesB) rhoB_t[i] += localrho;
+        if(NEWTON_PAIR || j < nlocal) {
+          localrho = RhoOfR(index, itype, jtype);
+          rho_t[j] += localrho;
+          if(itype == speciesB) rhoB_t[j] += localrho;
+        }
+
+        if(CDEAMVERSION == 1 && itype != jtype) {
+          // Note: if the i-j interaction is not concentration dependent (because either
+          // i or j are not species A or B) then its contribution to D_i and D_j should
+          // be ignored.
+          // This if-clause is only required for a ternary.
+          if((itype == speciesA && jtype == speciesB)
+             || (jtype == speciesA && itype == speciesB)) {
+            double Phi_AB = PhiOfR(index, itype, jtype, 1.0 / r);
+            D_values_t[i] += Phi_AB;
+            if(NEWTON_PAIR || j < nlocal)
+              D_values_t[j] += Phi_AB;
+          }
+        }
+      }
+    }
+  }
+
+  // wait until all threads are done with computation
+  sync_threads();
+
+  // communicate and sum densities
+
+  if (NEWTON_PAIR) {
+    // reduce per thread density
+    data_reduce_thr(rho, nall, nthreads, 1, tid);
+    data_reduce_thr(rhoB, nall, nthreads, 1, tid);
+    if (CDEAMVERSION==1)
+      data_reduce_thr(D_values, nall, nthreads, 1, tid);
+
+    // wait until reduction is complete
+    sync_threads();
+
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+    { communicationStage = 1;
+      comm->reverse_comm_pair(this); }
+
+    // wait until master thread is done with communication
+    sync_threads();
+
+  } else {
+    // reduce per thread density
+    data_reduce_thr(rho, nlocal, nthreads, 1, tid);
+    data_reduce_thr(rhoB, nlocal, nthreads, 1, tid);
+    if (CDEAMVERSION==1)
+      data_reduce_thr(D_values, nlocal, nthreads, 1, tid);
+
+    // wait until reduction is complete
+    sync_threads();
+  }
+
+  // fp = derivative of embedding energy at each atom
+  // phi = embedding energy at each atom
+
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    EAMTableIndex index = rhoToTableIndex(rho[i]);
+    fp[i] = FPrimeOfRho(index, type[i]);
+    if(EFLAG) {
+      phi = FofRho(index, type[i]);
+      e_tally_thr(this, i, i, nlocal, NEWTON_PAIR, phi, 0.0, thr);
+    }
+  }
+
+  // wait until all theads are done with computation
+  sync_threads();
+
+  // Communicate derivative of embedding function and densities
+  // and D_values (this for one-site formulation only).
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+  { communicationStage = 2;
+    comm->forward_comm_pair(this); }
+
+  // wait until master thread is done with communication
+  sync_threads();
+
+
+  // The electron densities may not drop to zero because then the concentration would no longer be defined.
+  // But the concentration is not needed anyway if there is no interaction with another atom, which is the case
+  // if the electron density is exactly zero. That's why the following lines have been commented out.
+  //
+  //for(i = 0; i < nlocal + atom->nghost; i++) {
+  //        if(rho[i] == 0 && (type[i] == speciesA || type[i] == speciesB))
+  //                error->one(FLERR,"CD-EAM potential routine: Detected atom with zero electron density.");
+  //}
+
+  // Stage II
+  // This is only required for the original two-site formulation of the CD-EAM potential.
+
+  if(CDEAMVERSION == 2) {
+    // Compute intermediate value D_i for each atom.
+    for (ii = iifrom; ii < iito; ii++) {
+      i = ilist[ii];
+      xtmp = x[i][0];
+      ytmp = x[i][1];
+      ztmp = x[i][2];
+      itype = type[i];
+      jlist = firstneigh[i];
+      jnum = numneigh[i];
+
+      // This code line is required for ternary alloys.
+      if(itype != speciesA && itype != speciesB) continue;
+
+      double x_i = rhoB[i] / rho[i];        // Concentration at atom i.
+
+      for(jj = 0; jj < jnum; jj++) {
+        j = jlist[jj];
+        j &= NEIGHMASK;
+        jtype = type[j];
+        if(itype == jtype) continue;
+
+        // This code line is required for ternary alloys.
+        if(jtype != speciesA && jtype != speciesB) continue;
+
+        delx = xtmp - x[j][0];
+        dely = ytmp - x[j][1];
+        delz = ztmp - x[j][2];
+        rsq = delx*delx + dely*dely + delz*delz;
+
+        if(rsq < cutforcesq) {
+          double r = sqrt(rsq);
+          const EAMTableIndex index = radiusToTableIndex(r);
+
+          // The concentration independent part of the cross pair potential.
+          double Phi_AB = PhiOfR(index, itype, jtype, 1.0 / r);
+
+          // Average concentration of two sites
+          double x_ij = 0.5 * (x_i + rhoB[j]/rho[j]);
+
+          // Calculate derivative of h(x_ij) polynomial function.
+          double h_prime = evalHprime(x_ij);
+
+          D_values_t[i] += h_prime * Phi_AB / (2.0 * rho[i] * rho[i]);
+          if(NEWTON_PAIR || j < nlocal)
+            D_values_t[j] += h_prime * Phi_AB / (2.0 * rho[j] * rho[j]);
+        }
+      }
+    }
+
+    if (NEWTON_PAIR) {
+      data_reduce_thr(D_values, nall, nthreads, 1, tid);
+
+      // wait until reduction is complete
+      sync_threads();
+
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+      { communicationStage = 3;
+        comm->reverse_comm_pair(this); }
+
+      // wait until master thread is done with communication
+      sync_threads();
+
+  } else {
+      data_reduce_thr(D_values, nlocal, nthreads, 1, tid);
+
+    // wait until reduction is complete
+    sync_threads();
+  }
+
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+    { communicationStage = 4;
+      comm->forward_comm_pair(this); }
+
+    // wait until master thread is done with communication
+    sync_threads();
+  }
+
+  // Stage III
+
+  // Compute force acting on each atom.
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    fxtmp = fytmp = fztmp = 0.0;
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    // Concentration at site i
+    double x_i = -1.0;                // The value -1 indicates: no concentration dependence for all interactions of atom i.
+    // It will be replaced by the concentration at site i if atom i is either A or B.
+
+    double D_i, h_prime_i;
+
+    // This if-clause is only required for ternary alloys.
+    if((itype == speciesA || itype == speciesB) && rho[i] != 0.0) {
+
+      // Compute local concentration at site i.
+      x_i = rhoB[i]/rho[i];
+      ASSERT(x_i >= 0 && x_i<=1.0);
+
+      if(CDEAMVERSION == 1) {
+        // Calculate derivative of h(x_i) polynomial function.
+        h_prime_i = evalHprime(x_i);
+        D_i = D_values[i] * h_prime_i / (2.0 * rho[i] * rho[i]);
+      } else if(CDEAMVERSION == 2) {
+        D_i = D_values[i];
+      } else ASSERT(false);
+    }
+
+    for(jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if(rsq < cutforcesq) {
+        jtype = type[j];
+        double r = sqrt(rsq);
+        const EAMTableIndex index = radiusToTableIndex(r);
+
+        // rhoip = derivative of (density at atom j due to atom i)
+        // rhojp = derivative of (density at atom i due to atom j)
+        // psip needs both fp[i] and fp[j] terms since r_ij appears in two
+        //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
+        //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
+        rhoip = RhoPrimeOfR(index, itype, jtype);
+        rhojp = RhoPrimeOfR(index, jtype, itype);
+        fpair = fp[i]*rhojp + fp[j]*rhoip;
+        recip = 1.0/r;
+
+        double x_j = -1;  // The value -1 indicates: no concentration dependence for this i-j pair
+        // because atom j is not of species A nor B.
+
+        // This code line is required for ternary alloy.
+        if(jtype == speciesA || jtype == speciesB) {
+          ASSERT(rho[i] != 0.0);
+          ASSERT(rho[j] != 0.0);
+
+          // Compute local concentration at site j.
+          x_j = rhoB[j]/rho[j];
+          ASSERT(x_j >= 0 && x_j<=1.0);
+
+          double D_j;
+          if(CDEAMVERSION == 1) {
+            // Calculate derivative of h(x_j) polynomial function.
+            double h_prime_j = evalHprime(x_j);
+            D_j = D_values[j] * h_prime_j / (2.0 * rho[j] * rho[j]);
+          } else if(CDEAMVERSION == 2) {
+            D_j = D_values[j];
+          } else ASSERT(false);
+
+          double t2 = -rhoB[j];
+          if(itype == speciesB) t2 += rho[j];
+          fpair += D_j * rhoip * t2;
+        }
+
+        // This if-clause is only required for a ternary alloy.
+        // Actually we don't need it at all because D_i should be zero anyway if
+        // atom i has no concentration dependent interactions (because it is not species A or B).
+        if(x_i != -1.0) {
+          double t1 = -rhoB[i];
+          if(jtype == speciesB) t1 += rho[i];
+          fpair += D_i * rhojp * t1;
+        }
+
+        double phip;
+        double phi = PhiOfR(index, itype, jtype, recip, phip);
+        if(itype == jtype || x_i == -1.0 || x_j == -1.0) {
+          // Case of no concentration dependence.
+          fpair += phip;
+        } else {
+          // We have a concentration dependence for the i-j interaction.
+          double h;
+          if(CDEAMVERSION == 1) {
+            // Calculate h(x_i) polynomial function.
+            double h_i = evalH(x_i);
+            // Calculate h(x_j) polynomial function.
+            double h_j = evalH(x_j);
+            h = 0.5 * (h_i + h_j);
+          } else if(CDEAMVERSION == 2) {
+            // Average concentration.
+            double x_ij = 0.5 * (x_i + x_j);
+            // Calculate h(x_ij) polynomial function.
+            h = evalH(x_ij);
+          } else ASSERT(false);
+
+          fpair += h * phip;
+          phi *= h;
+        }
+
+        // Divide by r_ij and negate to get forces from gradient.
+        fpair /= -r;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if(NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if(EFLAG) evdwl = phi;
+        if(EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,evdwl,0.0,
+                                fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairCDEAMOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairCDEAM::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_cdeam_omp.h b/src/USER-OMP/pair_cdeam_omp.h
new file mode 100644
index 0000000000..2839ab0ed0
--- /dev/null
+++ b/src/USER-OMP/pair_cdeam_omp.h
@@ -0,0 +1,65 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/cd/omp,PairCDEAM_OneSiteOMP)
+PairStyle(eam/cd/old/omp,PairCDEAM_TwoSiteOMP)
+
+#else
+
+#ifndef LMP_PAIR_CDEAM_OMP_H
+#define LMP_PAIR_CDEAM_OMP_H
+
+#include "pair_cdeam.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairCDEAMOMP : public PairCDEAM, public ThrOMP {
+
+ public:
+  PairCDEAMOMP(class LAMMPS *, int);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, int CDEAMVERSION>
+  void eval(int iifrom, int iito, ThrData * const thr);
+};
+
+  /// The one-site concentration formulation of CD-EAM.
+  class PairCDEAM_OneSiteOMP : public PairCDEAMOMP
+  {
+  public:
+    /// Constructor.
+    PairCDEAM_OneSiteOMP(class LAMMPS* lmp) : PairEAM(lmp), PairCDEAMOMP(lmp, 1) {}
+  };
+
+  /// The two-site concentration formulation of CD-EAM.
+  class PairCDEAM_TwoSiteOMP : public PairCDEAMOMP
+  {
+  public:
+    /// Constructor.
+    PairCDEAM_TwoSiteOMP(class LAMMPS* lmp) : PairEAM(lmp), PairCDEAMOMP(lmp, 2) {}
+  };
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_colloid_omp.cpp b/src/USER-OMP/pair_colloid_omp.cpp
new file mode 100644
index 0000000000..e2f5c56d99
--- /dev/null
+++ b/src/USER-OMP/pair_colloid_omp.cpp
@@ -0,0 +1,226 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_colloid_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairColloidOMP::PairColloidOMP(LAMMPS *lmp) :
+  PairColloid(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairColloidOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairColloidOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r,r2inv,r6inv,forcelj,factor_lj;
+  double c1,c2,fR,dUR,dUA,K[9],h[4],g[4];
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const int tid = thr->get_tid();
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq >= cutsq[itype][jtype]) continue;
+
+      switch(form[itype][jtype]) {
+      case SMALL_SMALL:
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        fpair = factor_lj*forcelj*r2inv;
+        if (EFLAG)
+          evdwl = r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) -
+            offset[itype][jtype];
+        break;
+
+      case SMALL_LARGE:
+        c2 = a2[itype][jtype];
+        K[1] = c2*c2;
+        K[2] = rsq;
+        K[0] = K[1] - rsq;
+        K[4] = rsq*rsq;
+        K[3] = K[1] - K[2];
+        K[3] *= K[3]*K[3];
+        K[6] = K[3]*K[3];
+        fR = sigma3[itype][jtype]*a12[itype][jtype]*c2*K[1]/K[3];
+        fpair = 4.0/15.0*fR*factor_lj *
+          (2.0*(K[1]+K[2]) * (K[1]*(5.0*K[1]+22.0*K[2])+5.0*K[4]) *
+           sigma6[itype][jtype]/K[6]-5.0) / K[0];
+        if (EFLAG)
+          evdwl = 2.0/9.0*fR *
+            (1.0-(K[1]*(K[1]*(K[1]/3.0+3.0*K[2])+4.2*K[4])+K[2]*K[4]) *
+             sigma6[itype][jtype]/K[6]) - offset[itype][jtype];
+
+        if (check_error_thr((rsq <= K[1]),tid,FLERR,
+                            "Overlapping small/large in pair colloid"))
+          return;
+
+        break;
+
+      case LARGE_LARGE:
+        r = sqrt(rsq);
+        c1 = a1[itype][jtype];
+        c2 = a2[itype][jtype];
+        K[0] = c1*c2;
+        K[1] = c1+c2;
+        K[2] = c1-c2;
+        K[3] = K[1]+r;
+        K[4] = K[1]-r;
+        K[5] = K[2]+r;
+        K[6] = K[2]-r;
+        K[7] = 1.0/(K[3]*K[4]);
+        K[8] = 1.0/(K[5]*K[6]);
+        g[0] = pow(K[3],-7.0);
+        g[1] = pow(K[4],-7.0);
+        g[2] = pow(K[5],-7.0);
+        g[3] = pow(K[6],-7.0);
+        h[0] = ((K[3]+5.0*K[1])*K[3]+30.0*K[0])*g[0];
+        h[1] = ((K[4]+5.0*K[1])*K[4]+30.0*K[0])*g[1];
+        h[2] = ((K[5]+5.0*K[2])*K[5]-30.0*K[0])*g[2];
+        h[3] = ((K[6]+5.0*K[2])*K[6]-30.0*K[0])*g[3];
+        g[0] *= 42.0*K[0]/K[3]+6.0*K[1]+K[3];
+        g[1] *= 42.0*K[0]/K[4]+6.0*K[1]+K[4];
+        g[2] *= -42.0*K[0]/K[5]+6.0*K[2]+K[5];
+        g[3] *= -42.0*K[0]/K[6]+6.0*K[2]+K[6];
+
+        fR = a12[itype][jtype]*sigma6[itype][jtype]/r/37800.0;
+        evdwl = fR * (h[0]-h[1]-h[2]+h[3]);
+        dUR = evdwl/r + 5.0*fR*(g[0]+g[1]-g[2]-g[3]);
+        dUA = -a12[itype][jtype]/3.0*r*((2.0*K[0]*K[7]+1.0)*K[7] +
+                                        (2.0*K[0]*K[8]-1.0)*K[8]);
+        fpair = factor_lj * (dUR+dUA)/r;
+        if (EFLAG)
+          evdwl += a12[itype][jtype]/6.0 *
+            (2.0*K[0]*(K[7]+K[8])-log(K[8]/K[7])) - offset[itype][jtype];
+        if (r <= K[1]) error->one(FLERR,"Overlapping large/large in pair colloid");
+        break;
+      }
+
+      if (EFLAG) evdwl *= factor_lj;
+
+      fxtmp += delx*fpair;
+      fytmp += dely*fpair;
+      fztmp += delz*fpair;
+      if (NEWTON_PAIR || j < nlocal) {
+        f[j][0] -= delx*fpair;
+        f[j][1] -= dely*fpair;
+        f[j][2] -= delz*fpair;
+      }
+
+      if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                               evdwl,0.0,fpair,delx,dely,delz,thr);
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairColloidOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairColloid::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_colloid_omp.h b/src/USER-OMP/pair_colloid_omp.h
new file mode 100644
index 0000000000..6605fbce00
--- /dev/null
+++ b/src/USER-OMP/pair_colloid_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(colloid/omp,PairColloidOMP)
+
+#else
+
+#ifndef LMP_PAIR_COLLOID_OMP_H
+#define LMP_PAIR_COLLOID_OMP_H
+
+#include "pair_colloid.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairColloidOMP : public PairColloid, public ThrOMP {
+
+ public:
+  PairColloidOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_comb_omp.cpp b/src/USER-OMP/pair_comb_omp.cpp
new file mode 100644
index 0000000000..eea2c6b610
--- /dev/null
+++ b/src/USER-OMP/pair_comb_omp.cpp
@@ -0,0 +1,638 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_comb_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "group.h"
+#include "force.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define MAXNEIGH 24
+
+/* ---------------------------------------------------------------------- */
+
+PairCombOMP::PairCombOMP(LAMMPS *lmp) :
+  PairComb(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCombOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = vflag_atom = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  // Build short range neighbor list
+
+  Short_neigh_thr();
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (vflag_atom) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (vflag_atom) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else eval<0,0,0>(ifrom, ito, thr);
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int VFLAG_ATOM>
+void PairCombOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,k,ii,jj,kk,jnum,iparam_i;
+  int itag,jtag,itype,jtype,ktype,iparam_ij,iparam_ijk;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,rsq1,rsq2;
+  double delr1[3],delr2[3],fi[3],fj[3],fk[3];
+  double zeta_ij,prefactor;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  int mr1,mr2,mr3;
+  int rsc,inty;
+  double elp_ij,filp[3],fjlp[3],fklp[3];
+  double iq,jq;
+  double yaself;
+  double potal,fac11,fac11e;
+  double vionij,fvionij,sr1,sr2,sr3,Eov,Fov;
+  int sht_jnum, *sht_jlist, nj;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const tag = atom->tag;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  yaself = vionij = fvionij = Eov = Fov = 0.0;
+
+  double fxtmp,fytmp,fztmp;
+  double fjxtmp,fjytmp,fjztmp;
+
+  // self energy correction term: potal
+
+  potal_calc(potal,fac11,fac11e);
+
+  // loop over full neighbor list of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    itag = tag[i];
+    itype = map[type[i]];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    fxtmp = fytmp = fztmp = 0.0;
+
+    iq = q[i];
+    NCo[i] = 0;
+    nj = 0;
+    iparam_i = elem2param[itype][itype][itype];
+
+    // self energy, only on i atom
+
+    yaself = self(&params[iparam_i],iq,potal);
+
+    if (EVFLAG) ev_tally_thr(this,i,i,nlocal,0,yaself,
+                             0.0,0.0,0.0,0.0,0.0,thr);
+
+    // two-body interactions (long and short repulsive)
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    sht_jlist = sht_first[i];
+    sht_jnum = sht_num[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtag = tag[j];
+
+      if (itag > jtag) {
+        if ((itag+jtag) % 2 == 0) continue;
+      } else if (itag < jtag) {
+        if ((itag+jtag) % 2 == 1) continue;
+      } else {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp && x[j][1] < ytmp) continue;
+        if (x[j][2] == ztmp && x[j][1] == ytmp && x[j][0] < xtmp) continue;
+      }
+
+      // Qj calculates 2-body Coulombic
+
+      jtype = map[type[j]];
+      jq = q[j];
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      iparam_ij = elem2param[itype][jtype][jtype];
+
+      // long range q-dependent
+
+      if (rsq > params[iparam_ij].lcutsq) continue;
+
+      inty = intype[itype][jtype];
+
+      // polynomial three-point interpolation
+
+      tri_point(rsq, mr1, mr2, mr3, sr1, sr2, sr3, itype);
+
+      // 1/r energy and forces
+
+      direct(inty,mr1,mr2,mr3,rsq,sr1,sr2,sr3,iq,jq,
+             potal,fac11,fac11e,vionij,fvionij);
+
+      // field correction to self energy
+
+      field(&params[iparam_ij],rsq,iq,jq,vionij,fvionij);
+
+      // polarization field
+      // sums up long range forces
+
+      fxtmp += delx*fvionij;
+      fytmp += dely*fvionij;
+      fztmp += delz*fvionij;
+      f[j][0] -= delx*fvionij;
+      f[j][1] -= dely*fvionij;
+      f[j][2] -= delz*fvionij;
+
+      if (EVFLAG)
+        ev_tally_thr(this,i,j,nlocal,/* newton_pair */ 1,
+                     0.0,vionij,fvionij,delx,dely,delz,thr);
+
+      // short range q-independent
+
+      if (rsq > params[iparam_ij].cutsq) continue;
+
+      repulsive(&params[iparam_ij],rsq,fpair,EFLAG,evdwl,iq,jq);
+
+      // repulsion is pure two-body, sums up pair repulsive forces
+
+      fxtmp += delx*fpair;
+      fytmp += dely*fpair;
+      fztmp += delz*fpair;
+      f[j][0] -= delx*fpair;
+      f[j][1] -= dely*fpair;
+      f[j][2] -= delz*fpair;
+
+      if (EVFLAG)
+        ev_tally_thr(this,i,j,nlocal,/* newton_pair */ 1,
+                     evdwl,0.0,fpair,delx,dely,delz,thr);
+    }
+
+    // accumulate coordination number information
+
+    if (cor_flag) {
+      int numcoor = 0;
+      for (jj = 0; jj < sht_jnum; jj++) {
+        j = sht_jlist[jj];
+        jtype = map[type[j]];
+        iparam_ij = elem2param[itype][jtype][jtype];
+
+        if(params[iparam_ij].hfocor > 0.0 ) {
+          delr1[0] = x[j][0] - xtmp;
+          delr1[1] = x[j][1] - ytmp;
+          delr1[2] = x[j][2] - ztmp;
+          rsq1 = vec3_dot(delr1,delr1);
+
+          if (rsq1 > params[iparam_ij].cutsq) continue;
+          ++numcoor;
+        }
+      }
+      NCo[i] = numcoor;
+    }
+
+    // three-body interactions
+    // half i-j loop
+
+    for (jj = 0; jj < sht_jnum; jj++) {
+      j = sht_jlist[jj];
+
+      jtype = map[type[j]];
+      iparam_ij = elem2param[itype][jtype][jtype];
+
+      // this Qj for q-dependent BSi
+
+      jq = q[j];
+
+      delr1[0] = x[j][0] - xtmp;
+      delr1[1] = x[j][1] - ytmp;
+      delr1[2] = x[j][2] - ztmp;
+      rsq1 = vec3_dot(delr1,delr1);
+
+      if (rsq1 > params[iparam_ij].cutsq) continue;
+      nj ++;
+
+      // accumulate bondorder zeta for each i-j interaction via loop over k
+
+      fjxtmp = fjytmp = fjztmp = 0.0;
+      zeta_ij = 0.0;
+      cuo_flag1 = 0; cuo_flag2 = 0;
+
+      for (kk = 0; kk < sht_jnum; kk++) {
+        k = sht_jlist[kk];
+        if (j == k) continue;
+        ktype = map[type[k]];
+        iparam_ijk = elem2param[itype][jtype][ktype];
+
+        delr2[0] = x[k][0] - xtmp;
+        delr2[1] = x[k][1] - ytmp;
+        delr2[2] = x[k][2] - ztmp;
+        rsq2 = vec3_dot(delr2,delr2);
+
+        if (rsq2 > params[iparam_ijk].cutsq) continue;
+
+        zeta_ij += zeta(&params[iparam_ijk],rsq1,rsq2,delr1,delr2);
+
+        if (params[iparam_ijk].hfocor == -2.0) cuo_flag1 = 1;
+        if (params[iparam_ijk].hfocor == -1.0) cuo_flag2 = 1;
+      }
+
+      if (cuo_flag1 && cuo_flag2) cuo_flag = 1;
+      else cuo_flag = 0;
+
+      force_zeta(&params[iparam_ij],EFLAG,i,nj,rsq1,zeta_ij,
+                 iq,jq,fpair,prefactor,evdwl);
+
+      // over-coordination correction for HfO2
+
+      if (cor_flag && NCo[i] != 0)
+        Over_cor(&params[iparam_ij],rsq1,NCo[i],Eov, Fov);
+      evdwl +=  Eov;
+      fpair +=  Fov;
+
+      fxtmp += delr1[0]*fpair;
+      fytmp += delr1[1]*fpair;
+      fztmp += delr1[2]*fpair;
+      fjxtmp -= delr1[0]*fpair;
+      fjytmp -= delr1[1]*fpair;
+      fjztmp -= delr1[2]*fpair;
+
+      if (EVFLAG) ev_tally_thr(this,i,j,nlocal,/* newton_pair */ 1,evdwl,0.0,
+                               -fpair,-delr1[0],-delr1[1],-delr1[2],thr);
+
+      // attractive term via loop over k (3-body forces)
+
+      for (kk = 0; kk < sht_jnum; kk++) {
+        k = sht_jlist[kk];
+        if (j == k) continue;
+        ktype = map[type[k]];
+        iparam_ijk = elem2param[itype][jtype][ktype];
+
+        delr2[0] = x[k][0] - xtmp;
+        delr2[1] = x[k][1] - ytmp;
+        delr2[2] = x[k][2] - ztmp;
+        rsq2 = vec3_dot(delr2,delr2);
+        if (rsq2 > params[iparam_ijk].cutsq) continue;
+
+        for (rsc = 0; rsc < 3; rsc++)
+          fi[rsc] = fj[rsc] = fk[rsc] = 0.0;
+
+        attractive(&params[iparam_ijk],prefactor,
+                   rsq1,rsq2,delr1,delr2,fi,fj,fk);
+
+        // 3-body LP and BB correction and forces
+
+        elp_ij = elp(&params[iparam_ijk],rsq1,rsq2,delr1,delr2);
+        flp(&params[iparam_ijk],rsq1,rsq2,delr1,delr2,filp,fjlp,fklp);
+
+        fxtmp += fi[0] + filp[0];
+        fytmp += fi[1] + filp[1];
+        fztmp += fi[2] + filp[2];
+        fjxtmp += fj[0] + fjlp[0];
+        fjytmp += fj[1] + fjlp[1];
+        fjztmp += fj[2] + fjlp[2];
+        f[k][0] += fk[0] + fklp[0];
+        f[k][1] += fk[1] + fklp[1];
+        f[k][2] += fk[2] + fklp[2];
+
+        if (EVFLAG)
+          ev_tally_thr(this,i,j,nlocal,/* newton_pair */ 1,
+                       elp_ij,0.0,0.0,0.0,0.0,0.0, thr);
+        if (VFLAG_ATOM) v_tally3_thr(i,j,k,fj,fk,delr1,delr2,thr);
+      }
+      f[j][0] += fjxtmp;
+      f[j][1] += fjytmp;
+      f[j][2] += fjztmp;
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+
+    if (cuo_flag) params[iparam_i].cutsq *= 0.65;
+  }
+  cuo_flag = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairCombOMP::yasu_char(double *qf_fix, int &igroup)
+{
+  int ii;
+  double potal,fac11,fac11e;
+
+  const double * const * const x = atom->x;
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int * const tag = atom->tag;
+
+  const int inum = list->inum;
+  const int * const ilist = list->ilist;
+  const int * const numneigh = list->numneigh;
+  const int * const * const firstneigh = list->firstneigh;
+
+  const int * const mask = atom->mask;
+  const int groupbit = group->bitmask[igroup];
+
+  qf = qf_fix;
+  for (ii = 0; ii < inum; ii++) {
+    const int i = ilist[ii];
+    if (mask[i] & groupbit)
+      qf[i] = 0.0;
+  }
+
+  // communicating charge force to all nodes, first forward then reverse
+
+  comm->forward_comm_pair(this);
+
+  // self energy correction term: potal
+
+  potal_calc(potal,fac11,fac11e);
+
+  // loop over full neighbor list of my atoms
+#if defined(_OPENMP)
+#pragma omp parallel for private(ii) default(none) shared(potal,fac11e)
+#endif
+  for (ii = 0; ii < inum; ii ++) {
+    double fqi,fqj,fqij,fqji,fqjj,delr1[3];
+    double sr1,sr2,sr3;
+    int mr1,mr2,mr3;
+
+    const int i = ilist[ii];
+    const int itag = tag[i];
+    int nj = 0;
+
+    if (mask[i] & groupbit) {
+      fqi = fqj = fqij = fqji = fqjj = 0.0; // should not be needed.
+      int itype = map[type[i]];
+      const double xtmp = x[i][0];
+      const double ytmp = x[i][1];
+      const double ztmp = x[i][2];
+      const double iq = q[i];
+      const int iparam_i = elem2param[itype][itype][itype];
+
+      // charge force from self energy
+
+      fqi = qfo_self(&params[iparam_i],iq,potal);
+
+      // two-body interactions
+
+      const int * const jlist = firstneigh[i];
+      const int jnum = numneigh[i];
+
+      for (int jj = 0; jj < jnum; jj++) {
+        const int j = jlist[jj] & NEIGHMASK;
+        const int jtag = tag[j];
+
+        if (itag > jtag) {
+          if ((itag+jtag) % 2 == 0) continue;
+        } else if (itag < jtag) {
+          if ((itag+jtag) % 2 == 1) continue;
+        } else {
+          if (x[j][2] < ytmp) continue;
+          if (x[j][2] == ztmp && x[j][1] < ytmp) continue;
+          if (x[j][2] == ztmp && x[j][1] == ytmp && x[j][0] < xtmp) continue;
+        }
+
+        const int jtype = map[type[j]];
+        double jq = q[j];
+
+        delr1[0] = x[j][0] - xtmp;
+        delr1[1] = x[j][1] - ytmp;
+        delr1[2] = x[j][2] - ztmp;
+        double rsq1 = vec3_dot(delr1,delr1);
+
+        const int iparam_ij = elem2param[itype][jtype][jtype];
+
+        // long range q-dependent
+
+        if (rsq1 > params[iparam_ij].lcutsq) continue;
+
+        const int inty = intype[itype][jtype];
+
+        // polynomial three-point interpolation
+
+        tri_point(rsq1,mr1,mr2,mr3,sr1,sr2,sr3,itype);
+
+        // 1/r charge forces
+
+        qfo_direct(inty,mr1,mr2,mr3,rsq1,sr1,sr2,sr3,fac11e,fqij);
+
+        // field correction to self energy and charge force
+
+        qfo_field(&params[iparam_ij],rsq1,iq,jq,fqji,fqjj);
+        fqi   += jq * fqij + fqji;
+#if defined(_OPENMP)
+#pragma omp atomic
+#endif
+        qf[j] += (iq * fqij + fqjj);
+      }
+
+        // three-body interactions
+
+      for (int jj = 0; jj < jnum; jj++) {
+        const int j = jlist[jj] & NEIGHMASK;
+        const int jtype = map[type[j]];
+        const double jq = q[j];
+
+        delr1[0] = x[j][0] - xtmp;
+        delr1[1] = x[j][1] - ytmp;
+        delr1[2] = x[j][2] - ztmp;
+        double rsq1 = vec3_dot(delr1,delr1);
+
+        const int iparam_ij = elem2param[itype][jtype][jtype];
+
+        if (rsq1 > params[iparam_ij].cutsq) continue;
+        nj ++;
+
+        // charge force in Aij and Bij
+
+        qfo_short(&params[iparam_ij],i,nj,rsq1,iq,jq,fqij,fqjj);
+        fqi += fqij;
+#if defined(_OPENMP)
+#pragma omp atomic
+#endif
+        qf[j] += fqjj;
+      }
+
+#if defined(_OPENMP)
+#pragma omp atomic
+#endif
+      qf[i] += fqi;
+    }
+  }
+
+  comm->reverse_comm_pair(this);
+
+  // sum charge force on each node and return it
+
+  double eneg = 0.0;
+  for (ii = 0; ii < inum; ii++) {
+    const int i = ilist[ii];
+    if (mask[i] & groupbit)
+      eneg += qf[i];
+  }
+  double enegtot;
+  MPI_Allreduce(&eneg,&enegtot,1,MPI_DOUBLE,MPI_SUM,world);
+  return enegtot;
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairCombOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairComb::memory_usage();
+
+  return bytes;
+}
+/* ---------------------------------------------------------------------- */
+
+void PairCombOMP::Short_neigh_thr()
+{
+
+  if (atom->nmax > nmax) {
+    nmax = atom->nmax;
+    memory->sfree(sht_first);
+    sht_first = (int **) memory->smalloc(nmax*sizeof(int *),
+            "pair:sht_first");
+    memory->grow(sht_num,nmax,"pair:sht_num");
+    memory->grow(NCo,nmax,"pair:NCo");
+    memory->grow(bbij,nmax,nmax,"pair:bbij");
+  }
+
+  const int nthreads = comm->nthreads;
+  if (nthreads > maxpage)
+    add_pages(nthreads - maxpage);
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    int nj,npntj,*neighptrj;
+    int iparam_ij,*ilist,*jlist,*numneigh,**firstneigh;
+    int jnum,i,j,ii,jj;
+    double xtmp,ytmp,ztmp,rsq,delrj[3];
+    double **x = atom->x;
+    int *type  = atom->type;
+
+    const int inum = list->inum;
+    ilist = list->ilist;
+    numneigh = list->numneigh;
+    firstneigh = list->firstneigh;
+
+
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+
+    const int iidelta = 1 + inum/nthreads;
+    const int iifrom = tid*iidelta;
+    int iito   = iifrom + iidelta;
+    if (iito > inum) iito = inum;
+
+    int npage = tid;
+    npntj = 0;
+
+    if (iifrom < inum) {
+      for (ii = iifrom; ii < iito; ii++) {
+        i = ilist[ii];
+
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+        if(pgsize - npntj < oneatom) {
+          npntj = 0;
+          npage += nthreads;
+          if (npage >= maxpage) add_pages(nthreads);
+        }
+
+        neighptrj = &pages[npage][npntj];
+        nj = 0;
+
+        xtmp = x[i][0];
+        ytmp = x[i][1];
+        ztmp = x[i][2];
+
+        jlist = firstneigh[i];
+        jnum = numneigh[i];
+
+        for (jj = 0; jj < jnum; jj++) {
+          j = jlist[jj];
+          j &= NEIGHMASK;
+
+          delrj[0] = xtmp - x[j][0];
+          delrj[1] = ytmp - x[j][1];
+          delrj[2] = ztmp - x[j][2];
+          rsq = vec3_dot(delrj,delrj);
+
+          if (rsq > cutmin) continue;
+          neighptrj[nj++] = j;
+        }
+        sht_first[i] = neighptrj;
+        sht_num[i] = nj;
+        npntj += nj;
+      }
+    }
+  }
+}
diff --git a/src/USER-OMP/pair_comb_omp.h b/src/USER-OMP/pair_comb_omp.h
new file mode 100644
index 0000000000..85011064e7
--- /dev/null
+++ b/src/USER-OMP/pair_comb_omp.h
@@ -0,0 +1,47 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(comb/omp,PairCombOMP)
+
+#else
+
+#ifndef LMP_PAIR_COMB_OMP_H
+#define LMP_PAIR_COMB_OMP_H
+
+#include "pair_comb.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairCombOMP : public PairComb, public ThrOMP {
+
+ public:
+  PairCombOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+  virtual double yasu_char(double *, int &);
+
+ private:
+  template <int EVFLAG, int EFLAG, int VFLAG_ATOM>
+  void eval(int ifrom, int ito, ThrData * const thr);
+
+  void Short_neigh_thr();
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_coul_cut_omp.cpp b/src/USER-OMP/pair_coul_cut_omp.cpp
new file mode 100644
index 0000000000..37b311f93e
--- /dev/null
+++ b/src/USER-OMP/pair_coul_cut_omp.cpp
@@ -0,0 +1,160 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_coul_cut_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairCoulCutOMP::PairCoulCutOMP(LAMMPS *lmp) :
+  PairCoulCut(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCoulCutOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairCoulCutOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,ecoul,fpair;
+  double rsq,r2inv,rinv,forcecoul,factor_coul;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        rinv = sqrt(r2inv);
+        forcecoul = qqrd2e * scale[itype][jtype] * qtmp*q[j]*rinv;
+        fpair = factor_coul*forcecoul * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG)
+          ecoul = factor_coul * qqrd2e * scale[itype][jtype] * qtmp*q[j]*rinv;
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 0.0,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairCoulCutOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairCoulCut::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_coul_cut_omp.h b/src/USER-OMP/pair_coul_cut_omp.h
new file mode 100644
index 0000000000..9e5fe81197
--- /dev/null
+++ b/src/USER-OMP/pair_coul_cut_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(coul/cut/omp,PairCoulCutOMP)
+
+#else
+
+#ifndef LMP_PAIR_COUL_CUT_OMP_H
+#define LMP_PAIR_COUL_CUT_OMP_H
+
+#include "pair_coul_cut.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairCoulCutOMP : public PairCoulCut, public ThrOMP {
+
+ public:
+  PairCoulCutOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_coul_debye_omp.cpp b/src/USER-OMP/pair_coul_debye_omp.cpp
new file mode 100644
index 0000000000..4c8aaba007
--- /dev/null
+++ b/src/USER-OMP/pair_coul_debye_omp.cpp
@@ -0,0 +1,161 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_coul_debye_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairCoulDebyeOMP::PairCoulDebyeOMP(LAMMPS *lmp) :
+  PairCoulDebye(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCoulDebyeOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairCoulDebyeOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,ecoul,fpair;
+  double rsq,r2inv,r,rinv,forcecoul,factor_coul,screening;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r = sqrt(rsq);
+        rinv = 1.0/r;
+        screening = exp(-kappa*r);
+        forcecoul = qqrd2e * qtmp*q[j] * screening * (kappa + rinv);
+        fpair = factor_coul*forcecoul * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG)
+          ecoul = factor_coul * qqrd2e * qtmp*q[j] * rinv * screening;
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 0.0,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+double PairCoulDebyeOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairCoulDebye::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_coul_debye_omp.h b/src/USER-OMP/pair_coul_debye_omp.h
new file mode 100644
index 0000000000..dc08647553
--- /dev/null
+++ b/src/USER-OMP/pair_coul_debye_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(coul/debye/omp,PairCoulDebyeOMP)
+
+#else
+
+#ifndef LMP_PAIR_COUL_DEBYE_OMP_H
+#define LMP_PAIR_COUL_DEBYE_OMP_H
+
+#include "pair_coul_debye.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairCoulDebyeOMP : public PairCoulDebye, public ThrOMP {
+
+ public:
+  PairCoulDebyeOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_coul_diel_omp.cpp b/src/USER-OMP/pair_coul_diel_omp.cpp
new file mode 100644
index 0000000000..c583800004
--- /dev/null
+++ b/src/USER-OMP/pair_coul_diel_omp.cpp
@@ -0,0 +1,167 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_coul_diel_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairCoulDielOMP::PairCoulDielOMP(LAMMPS *lmp) :
+  PairCoulDiel(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCoulDielOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairCoulDielOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,ecoul,fpair;
+  double rsq,r,rarg,th,depsdr,epsr,forcecoul,factor_coul;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+        rarg = (r-rme[itype][jtype])/sigmae[itype][jtype];
+        th=tanh(rarg);
+        epsr=a_eps+b_eps*th;
+        depsdr=b_eps * (1.0 - th*th) / sigmae[itype][jtype];
+
+        forcecoul = qqrd2e*qtmp*q[j]*((eps_s*(epsr+r*depsdr)/epsr/epsr) -1.)/rsq;
+        fpair = factor_coul*forcecoul/r;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          ecoul = (qqrd2e*qtmp*q[j]*((eps_s/epsr) -1.)/r) - offset[itype][jtype];
+          ecoul *= factor_coul;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 0.0,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairCoulDielOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairCoulDiel::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_coul_diel_omp.h b/src/USER-OMP/pair_coul_diel_omp.h
new file mode 100644
index 0000000000..68206e4c28
--- /dev/null
+++ b/src/USER-OMP/pair_coul_diel_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(coul/diel/omp,PairCoulDielOMP)
+
+#else
+
+#ifndef LMP_PAIR_COUL_DIEL_OMP_H
+#define LMP_PAIR_COUL_DIEL_OMP_H
+
+#include "pair_coul_diel.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairCoulDielOMP : public PairCoulDiel, public ThrOMP {
+
+ public:
+  PairCoulDielOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_coul_long_omp.cpp b/src/USER-OMP/pair_coul_long_omp.cpp
new file mode 100644
index 0000000000..5ae5478755
--- /dev/null
+++ b/src/USER-OMP/pair_coul_long_omp.cpp
@@ -0,0 +1,200 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_coul_long_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairCoulLongOMP::PairCoulLongOMP(LAMMPS *lmp) :
+  PairCoulLong(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  cut_respa = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCoulLongOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairCoulLongOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itable,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,ecoul,fpair;
+  double fraction,table;
+  double r,r2inv,rsq,forcecoul,factor_coul;
+  double grij,expm2,prefactor,t,erfc;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cut_coulsq) {
+        r2inv = 1.0/rsq;
+        if (!ncoultablebits || rsq <= tabinnersq) {
+          r = sqrt(rsq);
+          grij = g_ewald * r;
+          expm2 = exp(-grij*grij);
+          t = 1.0 / (1.0 + EWALD_P*grij);
+          erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          prefactor = qqrd2e * scale[itype][jtype] * qtmp*q[j]/r;
+          forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+          if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+        } else {
+          union_int_float_t rsq_lookup;
+          rsq_lookup.f = rsq;
+          itable = rsq_lookup.i & ncoulmask;
+          itable >>= ncoulshiftbits;
+          fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+          table = ftable[itable] + fraction*dftable[itable];
+          forcecoul = scale[itype][jtype] * qtmp*q[j] * table;
+          if (factor_coul < 1.0) {
+            table = ctable[itable] + fraction*dctable[itable];
+            prefactor = scale[itype][jtype] * qtmp*q[j] * table;
+            forcecoul -= (1.0-factor_coul)*prefactor;
+          }
+        }
+
+        fpair = forcecoul * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (!ncoultablebits || rsq <= tabinnersq)
+            ecoul = prefactor*erfc;
+          else {
+            table = etable[itable] + fraction*detable[itable];
+            ecoul = scale[itype][jtype] * qtmp*q[j] * table;
+          }
+          if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 0.0,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairCoulLongOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairCoulLong::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_coul_long_omp.h b/src/USER-OMP/pair_coul_long_omp.h
new file mode 100644
index 0000000000..d628d22b60
--- /dev/null
+++ b/src/USER-OMP/pair_coul_long_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(coul/long/omp,PairCoulLongOMP)
+
+#else
+
+#ifndef LMP_PAIR_COUL_LONG_OMP_H
+#define LMP_PAIR_COUL_LONG_OMP_H
+
+#include "pair_coul_long.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairCoulLongOMP : public PairCoulLong, public ThrOMP {
+
+ public:
+  PairCoulLongOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_coul_wolf_omp.cpp b/src/USER-OMP/pair_coul_wolf_omp.cpp
new file mode 100644
index 0000000000..16e0c4ed90
--- /dev/null
+++ b/src/USER-OMP/pair_coul_wolf_omp.cpp
@@ -0,0 +1,183 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_coul_wolf_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "math_const.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+/* ---------------------------------------------------------------------- */
+
+PairCoulWolfOMP::PairCoulWolfOMP(LAMMPS *lmp) :
+  PairCoulWolf(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCoulWolfOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairCoulWolfOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,ecoul,fpair;
+  double rsq,forcecoul,factor_coul;
+  double prefactor;
+  double r,rexp;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  double erfcc,erfcd,v_sh,dvdrr,e_self,e_shift,f_shift,qisq;
+
+  ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  // self and shifted coulombic energy
+
+  e_self = v_sh = 0.0;
+  e_shift = erfc(alf*cut_coul)/cut_coul;
+  f_shift = -(e_shift+ 2.0*alf/MY_PIS * exp(-alf*alf*cut_coul*cut_coul)) /
+    cut_coul;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    qisq = qtmp*qtmp;
+    e_self = -(e_shift/2.0 + alf/MY_PIS) * qisq*qqrd2e;
+    if (EVFLAG) ev_tally_thr(this,i,i,nlocal,0,0.0,e_self,0.0,0.0,0.0,0.0,thr);
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cut_coulsq) {
+        r = sqrt(rsq);
+        prefactor = qqrd2e*qtmp*q[j]/r;
+        erfcc = erfc(alf*r);
+        erfcd = exp(-alf*alf*r*r);
+        v_sh = (erfcc - e_shift*r) * prefactor;
+        dvdrr = (erfcc/rsq + 2.0*alf/MY_PIS * erfcd/r) + f_shift;
+        forcecoul = dvdrr*rsq*prefactor;
+        if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+        fpair = forcecoul / rsq;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq) {
+            ecoul = v_sh;
+            if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+          } else ecoul = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 0.0,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairCoulWolfOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairCoulWolf::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_coul_wolf_omp.h b/src/USER-OMP/pair_coul_wolf_omp.h
new file mode 100644
index 0000000000..aef683d22c
--- /dev/null
+++ b/src/USER-OMP/pair_coul_wolf_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(coul/wolf/omp,PairCoulWolfOMP)
+
+#else
+
+#ifndef LMP_PAIR_COUL_WOLF_OMP_H
+#define LMP_PAIR_COUL_WOLF_OMP_H
+
+#include "pair_coul_wolf.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairCoulWolfOMP : public PairCoulWolf, public ThrOMP {
+
+ public:
+  PairCoulWolfOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_dipole_cut_omp.cpp b/src/USER-OMP/pair_dipole_cut_omp.cpp
new file mode 100644
index 0000000000..34d7e6c17e
--- /dev/null
+++ b/src/USER-OMP/pair_dipole_cut_omp.cpp
@@ -0,0 +1,286 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_dipole_cut_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairDipoleCutOMP::PairDipoleCutOMP(LAMMPS *lmp) :
+  PairDipoleCut(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairDipoleCutOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairDipoleCutOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,qtmp,delx,dely,delz,evdwl,ecoul;
+  double rsq,rinv,r2inv,r6inv,r3inv,r5inv,r7inv,fx,fy,fz;
+  double forcecoulx,forcecouly,forcecoulz,crossx,crossy,crossz;
+  double tixcoul,tiycoul,tizcoul,tjxcoul,tjycoul,tjzcoul;
+  double fq,pdotp,pidotr,pjdotr,pre1,pre2,pre3,pre4;
+  double forcelj,factor_coul,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  double * const * const torque = thr->get_torque();
+  const double * const q = atom->q;
+  const double * const * const mu = atom->mu;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp,t1tmp,t2tmp,t3tmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    qtmp = q[i];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=t1tmp=t2tmp=t3tmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_coul = special_coul[sbmask(j)];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        rinv = sqrt(r2inv);
+
+        // atom can have both a charge and dipole
+        // i,j = charge-charge, dipole-dipole, dipole-charge, or charge-dipole
+
+        forcecoulx = forcecouly = forcecoulz = 0.0;
+        tixcoul = tiycoul = tizcoul = 0.0;
+        tjxcoul = tjycoul = tjzcoul = 0.0;
+
+        if (rsq < cut_coulsq[itype][jtype]) {
+
+          if (qtmp != 0.0 && q[j] != 0.0) {
+            r3inv = r2inv*rinv;
+            pre1 = qtmp*q[j]*r3inv;
+
+            forcecoulx += pre1*delx;
+            forcecouly += pre1*dely;
+            forcecoulz += pre1*delz;
+          }
+
+          if (mu[i][3] > 0.0 && mu[j][3] > 0.0) {
+            r3inv = r2inv*rinv;
+            r5inv = r3inv*r2inv;
+            r7inv = r5inv*r2inv;
+
+            pdotp = mu[i][0]*mu[j][0] + mu[i][1]*mu[j][1] + mu[i][2]*mu[j][2];
+            pidotr = mu[i][0]*delx + mu[i][1]*dely + mu[i][2]*delz;
+            pjdotr = mu[j][0]*delx + mu[j][1]*dely + mu[j][2]*delz;
+
+            pre1 = 3.0*r5inv*pdotp - 15.0*r7inv*pidotr*pjdotr;
+            pre2 = 3.0*r5inv*pjdotr;
+            pre3 = 3.0*r5inv*pidotr;
+            pre4 = -1.0*r3inv;
+
+            forcecoulx += pre1*delx + pre2*mu[i][0] + pre3*mu[j][0];
+            forcecouly += pre1*dely + pre2*mu[i][1] + pre3*mu[j][1];
+            forcecoulz += pre1*delz + pre2*mu[i][2] + pre3*mu[j][2];
+
+            crossx = pre4 * (mu[i][1]*mu[j][2] - mu[i][2]*mu[j][1]);
+            crossy = pre4 * (mu[i][2]*mu[j][0] - mu[i][0]*mu[j][2]);
+            crossz = pre4 * (mu[i][0]*mu[j][1] - mu[i][1]*mu[j][0]);
+
+            tixcoul += crossx + pre2 * (mu[i][1]*delz - mu[i][2]*dely);
+            tiycoul += crossy + pre2 * (mu[i][2]*delx - mu[i][0]*delz);
+            tizcoul += crossz + pre2 * (mu[i][0]*dely - mu[i][1]*delx);
+            tjxcoul += -crossx + pre3 * (mu[j][1]*delz - mu[j][2]*dely);
+            tjycoul += -crossy + pre3 * (mu[j][2]*delx - mu[j][0]*delz);
+            tjzcoul += -crossz + pre3 * (mu[j][0]*dely - mu[j][1]*delx);
+          }
+
+          if (mu[i][3] > 0.0 && q[j] != 0.0) {
+            r3inv = r2inv*rinv;
+            r5inv = r3inv*r2inv;
+            pidotr = mu[i][0]*delx + mu[i][1]*dely + mu[i][2]*delz;
+            pre1 = 3.0*q[j]*r5inv * pidotr;
+            pre2 = q[j]*r3inv;
+
+            forcecoulx += pre2*mu[i][0] - pre1*delx;
+            forcecouly += pre2*mu[i][1] - pre1*dely;
+            forcecoulz += pre2*mu[i][2] - pre1*delz;
+            tixcoul += pre2 * (mu[i][1]*delz - mu[i][2]*dely);
+            tiycoul += pre2 * (mu[i][2]*delx - mu[i][0]*delz);
+            tizcoul += pre2 * (mu[i][0]*dely - mu[i][1]*delx);
+          }
+
+          if (mu[j][3] > 0.0 && qtmp != 0.0) {
+            r3inv = r2inv*rinv;
+            r5inv = r3inv*r2inv;
+            pjdotr = mu[j][0]*delx + mu[j][1]*dely + mu[j][2]*delz;
+            pre1 = 3.0*qtmp*r5inv * pjdotr;
+            pre2 = qtmp*r3inv;
+
+            forcecoulx += pre1*delx - pre2*mu[j][0];
+            forcecouly += pre1*dely - pre2*mu[j][1];
+            forcecoulz += pre1*delz - pre2*mu[j][2];
+            tjxcoul += -pre2 * (mu[j][1]*delz - mu[j][2]*dely);
+            tjycoul += -pre2 * (mu[j][2]*delx - mu[j][0]*delz);
+            tjzcoul += -pre2 * (mu[j][0]*dely - mu[j][1]*delx);
+          }
+        }
+
+        // LJ interaction
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          forcelj *= factor_lj * r2inv;
+        } else forcelj = 0.0;
+
+        // total force
+
+        fq = factor_coul*qqrd2e;
+        fx = fq*forcecoulx + delx*forcelj;
+        fy = fq*forcecouly + dely*forcelj;
+        fz = fq*forcecoulz + delz*forcelj;
+
+        // force & torque accumulation
+
+        fxtmp += fx;
+        fytmp += fy;
+        fztmp += fz;
+        t1tmp += fq*tixcoul;
+        t2tmp += fq*tiycoul;
+        t3tmp += fq*tizcoul;
+
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= fx;
+          f[j][1] -= fy;
+          f[j][2] -= fz;
+          torque[j][0] += fq*tjxcoul;
+          torque[j][1] += fq*tjycoul;
+          torque[j][2] += fq*tjzcoul;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq[itype][jtype]) {
+            ecoul = qtmp*q[j]*rinv;
+            if (mu[i][3] > 0.0 && mu[j][3] > 0.0)
+              ecoul += r3inv*pdotp - 3.0*r5inv*pidotr*pjdotr;
+            if (mu[i][3] > 0.0 && q[j] != 0.0)
+              ecoul += -q[j]*r3inv*pidotr;
+            if (mu[j][3] > 0.0 && qtmp != 0.0)
+              ecoul += qtmp*r3inv*pjdotr;
+            ecoul *= factor_coul*qqrd2e;
+          } else ecoul = 0.0;
+
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+              offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_xyz_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                     evdwl,ecoul,fx,fy,fz,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+    torque[i][0] += t1tmp;
+    torque[i][1] += t2tmp;
+    torque[i][2] += t3tmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairDipoleCutOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairDipoleCut::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_dipole_cut_omp.h b/src/USER-OMP/pair_dipole_cut_omp.h
new file mode 100644
index 0000000000..4e874b1ffb
--- /dev/null
+++ b/src/USER-OMP/pair_dipole_cut_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(dipole/cut/omp,PairDipoleCutOMP)
+
+#else
+
+#ifndef LMP_PAIR_DIPOLE_CUT_OMP_H
+#define LMP_PAIR_DIPOLE_CUT_OMP_H
+
+#include "pair_dipole_cut.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairDipoleCutOMP : public PairDipoleCut, public ThrOMP {
+
+ public:
+  PairDipoleCutOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_dipole_sf_omp.cpp b/src/USER-OMP/pair_dipole_sf_omp.cpp
new file mode 100644
index 0000000000..4090bc3c02
--- /dev/null
+++ b/src/USER-OMP/pair_dipole_sf_omp.cpp
@@ -0,0 +1,318 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_dipole_sf_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairDipoleSFOMP::PairDipoleSFOMP(LAMMPS *lmp) :
+  PairDipoleSF(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairDipoleSFOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairDipoleSFOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,qtmp,delx,dely,delz,evdwl,ecoul;
+  double rsq,rinv,r2inv,r6inv,r3inv,r5inv,fx,fy,fz;
+  double forcecoulx,forcecouly,forcecoulz,crossx,crossy,crossz;
+  double tixcoul,tiycoul,tizcoul,tjxcoul,tjycoul,tjzcoul;
+  double fq,pdotp,pidotr,pjdotr,pre1,pre2,pre3,pre4;
+  double forcelj,factor_coul,factor_lj;
+  double presf,afac,bfac,pqfac,qpfac,forceljcut,forceljsf;
+  double aforcecoulx,aforcecouly,aforcecoulz;
+  double bforcecoulx,bforcecouly,bforcecoulz;
+  double rcutlj2inv, rcutcoul2inv,rcutlj6inv;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  double * const * const torque = thr->get_torque();
+  const double * const q = atom->q;
+  const double * const * const mu = atom->mu;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp,t1tmp,t2tmp,t3tmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    qtmp = q[i];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=t1tmp=t2tmp=t3tmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_coul = special_coul[sbmask(j)];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        rinv = sqrt(r2inv);
+
+        // atom can have both a charge and dipole
+        // i,j = charge-charge, dipole-dipole, dipole-charge, or charge-dipole
+        // atom can have both a charge and dipole
+        // i,j = charge-charge, dipole-dipole, dipole-charge, or charge-dipole
+
+        forcecoulx = forcecouly = forcecoulz = 0.0;
+        tixcoul = tiycoul = tizcoul = 0.0;
+        tjxcoul = tjycoul = tjzcoul = 0.0;
+
+        if (rsq < cut_coulsq[itype][jtype]) {
+
+          if (qtmp != 0.0 && q[j] != 0.0) {
+            pre1 = qtmp*q[j]*rinv*(r2inv-1.0/cut_coulsq[itype][jtype]);
+
+            forcecoulx += pre1*delx;
+            forcecouly += pre1*dely;
+            forcecoulz += pre1*delz;
+          }
+
+          if (mu[i][3] > 0.0 && mu[j][3] > 0.0) {
+            r3inv = r2inv*rinv;
+            r5inv = r3inv*r2inv;
+            rcutcoul2inv=1.0/cut_coulsq[itype][jtype];
+
+            pdotp = mu[i][0]*mu[j][0] + mu[i][1]*mu[j][1] + mu[i][2]*mu[j][2];
+            pidotr = mu[i][0]*delx + mu[i][1]*dely + mu[i][2]*delz;
+            pjdotr = mu[j][0]*delx + mu[j][1]*dely + mu[j][2]*delz;
+
+            afac = 1.0 - rsq*rsq * rcutcoul2inv*rcutcoul2inv;
+            pre1 = afac * ( pdotp - 3.0 * r2inv * pidotr * pjdotr );
+            aforcecoulx = pre1*delx;
+            aforcecouly = pre1*dely;
+            aforcecoulz = pre1*delz;
+
+            bfac = 1.0 - 4.0*rsq*sqrt(rsq)*rcutcoul2inv*sqrt(rcutcoul2inv) +
+              3.0*rsq*rsq*rcutcoul2inv*rcutcoul2inv;
+            presf = 2.0 * r2inv * pidotr * pjdotr;
+            bforcecoulx = bfac * (pjdotr*mu[i][0]+pidotr*mu[j][0]-presf*delx);
+            bforcecouly = bfac * (pjdotr*mu[i][1]+pidotr*mu[j][1]-presf*dely);
+            bforcecoulz = bfac * (pjdotr*mu[i][2]+pidotr*mu[j][2]-presf*delz);
+
+            forcecoulx += 3.0 * r5inv * ( aforcecoulx + bforcecoulx );
+            forcecouly += 3.0 * r5inv * ( aforcecouly + bforcecouly );
+            forcecoulz += 3.0 * r5inv * ( aforcecoulz + bforcecoulz );
+
+            pre2 = 3.0 * bfac * r5inv * pjdotr;
+            pre3 = 3.0 * bfac * r5inv * pidotr;
+            pre4 = -bfac * r3inv;
+
+            crossx = pre4 * (mu[i][1]*mu[j][2] - mu[i][2]*mu[j][1]);
+            crossy = pre4 * (mu[i][2]*mu[j][0] - mu[i][0]*mu[j][2]);
+            crossz = pre4 * (mu[i][0]*mu[j][1] - mu[i][1]*mu[j][0]);
+
+            tixcoul += crossx + pre2 * (mu[i][1]*delz - mu[i][2]*dely);
+            tiycoul += crossy + pre2 * (mu[i][2]*delx - mu[i][0]*delz);
+            tizcoul += crossz + pre2 * (mu[i][0]*dely - mu[i][1]*delx);
+            tjxcoul += -crossx + pre3 * (mu[j][1]*delz - mu[j][2]*dely);
+            tjycoul += -crossy + pre3 * (mu[j][2]*delx - mu[j][0]*delz);
+            tjzcoul += -crossz + pre3 * (mu[j][0]*dely - mu[j][1]*delx);
+          }
+
+          if (mu[i][3] > 0.0 && q[j] != 0.0) {
+            r3inv = r2inv*rinv;
+            r5inv = r3inv*r2inv;
+            pidotr = mu[i][0]*delx + mu[i][1]*dely + mu[i][2]*delz;
+            rcutcoul2inv=1.0/cut_coulsq[itype][jtype];
+            pre1 = 3.0 * q[j] * r5inv * pidotr * (1-rsq*rcutcoul2inv);
+            pqfac = 1.0 - 3.0*rsq*rcutcoul2inv +
+              2.0*rsq*sqrt(rsq)*rcutcoul2inv*sqrt(rcutcoul2inv);
+            pre2 = q[j] * r3inv * pqfac;
+
+            forcecoulx += pre2*mu[i][0] - pre1*delx;
+            forcecouly += pre2*mu[i][1] - pre1*dely;
+            forcecoulz += pre2*mu[i][2] - pre1*delz;
+            tixcoul += pre2 * (mu[i][1]*delz - mu[i][2]*dely);
+            tiycoul += pre2 * (mu[i][2]*delx - mu[i][0]*delz);
+            tizcoul += pre2 * (mu[i][0]*dely - mu[i][1]*delx);
+          }
+
+          if (mu[j][3] > 0.0 && qtmp != 0.0) {
+            r3inv = r2inv*rinv;
+            r5inv = r3inv*r2inv;
+            pjdotr = mu[j][0]*delx + mu[j][1]*dely + mu[j][2]*delz;
+            rcutcoul2inv=1.0/cut_coulsq[itype][jtype];
+            pre1 = 3.0 * qtmp * r5inv * pjdotr * (1-rsq*rcutcoul2inv);
+            qpfac = 1.0 - 3.0*rsq*rcutcoul2inv +
+              2.0*rsq*sqrt(rsq)*rcutcoul2inv*sqrt(rcutcoul2inv);
+            pre2 = qtmp * r3inv * qpfac;
+
+            forcecoulx += pre1*delx - pre2*mu[j][0];
+            forcecouly += pre1*dely - pre2*mu[j][1];
+            forcecoulz += pre1*delz - pre2*mu[j][2];
+            tjxcoul += -pre2 * (mu[j][1]*delz - mu[j][2]*dely);
+            tjycoul += -pre2 * (mu[j][2]*delx - mu[j][0]*delz);
+            tjzcoul += -pre2 * (mu[j][0]*dely - mu[j][1]*delx);
+          }
+        }
+
+        // LJ interaction
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          forceljcut = r6inv*(lj1[itype][jtype]*r6inv-lj2[itype][jtype])*r2inv;
+
+          rcutlj2inv = 1.0 / cut_ljsq[itype][jtype];
+          rcutlj6inv = rcutlj2inv * rcutlj2inv * rcutlj2inv;
+          forceljsf = (lj1[itype][jtype]*rcutlj6inv - lj2[itype][jtype]) *
+            rcutlj6inv * rcutlj2inv;
+
+          forcelj = factor_lj * (forceljcut - forceljsf);
+        } else forcelj = 0.0;
+
+        // total force
+
+        fq = factor_coul*qqrd2e;
+        fx = fq*forcecoulx + delx*forcelj;
+        fy = fq*forcecouly + dely*forcelj;
+        fz = fq*forcecoulz + delz*forcelj;
+
+        // force & torque accumulation
+
+        fxtmp += fx;
+        fytmp += fy;
+        fztmp += fz;
+        t1tmp += fq*tixcoul;
+        t2tmp += fq*tiycoul;
+        t3tmp += fq*tizcoul;
+
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= fx;
+          f[j][1] -= fy;
+          f[j][2] -= fz;
+          torque[j][0] += fq*tjxcoul;
+          torque[j][1] += fq*tjycoul;
+          torque[j][2] += fq*tjzcoul;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq[itype][jtype]) {
+            ecoul = qtmp * q[j] * rinv *
+              pow((1.0-sqrt(rsq)/sqrt(cut_coulsq[itype][jtype])),2.0);
+            if (mu[i][3] > 0.0 && mu[j][3] > 0.0)
+              ecoul += bfac * (r3inv*pdotp - 3.0*r5inv*pidotr*pjdotr);
+            if (mu[i][3] > 0.0 && q[j] != 0.0)
+              ecoul += -q[j] * r3inv * pqfac * pidotr;
+            if (mu[j][3] > 0.0 && qtmp != 0.0)
+              ecoul += qtmp * r3inv * qpfac * pjdotr;
+            ecoul *= factor_coul*qqrd2e;
+          } else ecoul = 0.0;
+
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype])+
+              rcutlj6inv*(6*lj3[itype][jtype]*rcutlj6inv-3*lj4[itype][jtype])*
+              rsq*rcutlj2inv+
+              rcutlj6inv*(-7*lj3[itype][jtype]*rcutlj6inv+4*lj4[itype][jtype]);
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_xyz_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                     evdwl,ecoul,fx,fy,fz,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+    torque[i][0] += t1tmp;
+    torque[i][1] += t2tmp;
+    torque[i][2] += t3tmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairDipoleSFOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairDipoleSF::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_dipole_sf_omp.h b/src/USER-OMP/pair_dipole_sf_omp.h
new file mode 100644
index 0000000000..3ff008cef5
--- /dev/null
+++ b/src/USER-OMP/pair_dipole_sf_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(dipole/sf/omp,PairDipoleSFOMP)
+
+#else
+
+#ifndef LMP_PAIR_DIPOLE_SF_OMP_H
+#define LMP_PAIR_DIPOLE_SF_OMP_H
+
+#include "pair_dipole_sf.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairDipoleSFOMP : public PairDipoleSF, public ThrOMP {
+
+ public:
+  PairDipoleSFOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_dpd_omp.cpp b/src/USER-OMP/pair_dpd_omp.cpp
new file mode 100644
index 0000000000..cb7f560e05
--- /dev/null
+++ b/src/USER-OMP/pair_dpd_omp.cpp
@@ -0,0 +1,214 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_dpd_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "update.h"
+#include "random_mars.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EPSILON 1.0e-10
+
+/* ---------------------------------------------------------------------- */
+
+PairDPDOMP::PairDPDOMP(LAMMPS *lmp) :
+  PairDPD(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  random_thr = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairDPDOMP::~PairDPDOMP()
+{
+  if (random_thr) {
+    for (int i=1; i < comm->nthreads; ++i)
+      delete random_thr[i];
+
+    delete[] random_thr;
+    random_thr = NULL;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairDPDOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  if (!random_thr)
+    random_thr = new RanMars*[nthreads];
+
+  // to ensure full compatibility with the serial DPD style
+  // we use is random number generator instance for thread 0
+  random_thr[0] = random;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // generate a random number generator instance for
+    // all threads != 0. make sure we use unique seeds.
+    if (random_thr && tid > 0)
+      random_thr[tid] = new RanMars(Pair::lmp, seed + comm->me
+                                    + comm->nprocs*tid);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairDPDOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double vxtmp,vytmp,vztmp,delvx,delvy,delvz;
+  double rsq,r,rinv,dot,wd,randnum,factor_dpd;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  const double * const * const v = atom->v;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double *special_lj = force->special_lj;
+  const double dtinvsqrt = 1.0/sqrt(update->dt);
+  double fxtmp,fytmp,fztmp;
+  RanMars &rng = *random_thr[thr->get_tid()];
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    vxtmp = v[i][0];
+    vytmp = v[i][1];
+    vztmp = v[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_dpd = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+        if (r < EPSILON) continue;     // r can be 0.0 in DPD systems
+        rinv = 1.0/r;
+        delvx = vxtmp - v[j][0];
+        delvy = vytmp - v[j][1];
+        delvz = vztmp - v[j][2];
+        dot = delx*delvx + dely*delvy + delz*delvz;
+        wd = 1.0 - r/cut[itype][jtype];
+        randnum = rng.gaussian();
+
+        // conservative force = a0 * wd
+        // drag force = -gamma * wd^2 * (delx dot delv) / r
+        // random force = sigma * wd * rnd * dtinvsqrt;
+
+        fpair = a0[itype][jtype]*wd;
+        fpair -= gamma[itype][jtype]*wd*wd*dot*rinv;
+        fpair += sigma[itype][jtype]*wd*randnum*dtinvsqrt;
+        fpair *= factor_dpd*rinv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          // unshifted eng of conservative term:
+          // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]);
+          // eng shifted to 0.0 at cutoff
+          evdwl = 0.5*a0[itype][jtype]*cut[itype][jtype] * wd*wd;
+          evdwl *= factor_dpd;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairDPDOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairDPD::memory_usage();
+  bytes += comm->nthreads * sizeof(RanMars*);
+  bytes += comm->nthreads * sizeof(RanMars);
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_dpd_omp.h b/src/USER-OMP/pair_dpd_omp.h
new file mode 100644
index 0000000000..470e6f1133
--- /dev/null
+++ b/src/USER-OMP/pair_dpd_omp.h
@@ -0,0 +1,52 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(dpd/omp,PairDPDOMP)
+
+#else
+
+#ifndef LMP_PAIR_DPD_OMP_H
+#define LMP_PAIR_DPD_OMP_H
+
+#include "pair_dpd.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairDPDOMP : public PairDPD, public ThrOMP {
+
+ public:
+  PairDPDOMP(class LAMMPS *);
+  virtual ~PairDPDOMP();
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ protected:
+  class RanMars **random_thr;
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_dpd_tstat_omp.cpp b/src/USER-OMP/pair_dpd_tstat_omp.cpp
new file mode 100644
index 0000000000..6965e69349
--- /dev/null
+++ b/src/USER-OMP/pair_dpd_tstat_omp.cpp
@@ -0,0 +1,214 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_dpd_tstat_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "update.h"
+#include "random_mars.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EPSILON 1.0e-10
+
+/* ---------------------------------------------------------------------- */
+
+PairDPDTstatOMP::PairDPDTstatOMP(LAMMPS *lmp) :
+  PairDPDTstat(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  random_thr = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairDPDTstatOMP::~PairDPDTstatOMP()
+{
+  if (random_thr) {
+    for (int i=1; i < comm->nthreads; ++i)
+      delete random_thr[i];
+
+    delete[] random_thr;
+    random_thr = NULL;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairDPDTstatOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  if (!random_thr)
+    random_thr = new RanMars*[nthreads];
+
+  // to ensure full compatibility with the serial DPD style
+  // we use is random number generator instance for thread 0
+  random_thr[0] = random;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // generate a random number generator instance for
+    // all threads != 0. make sure we use unique seeds.
+    if (random_thr && tid > 0)
+      random_thr[tid] = new RanMars(Pair::lmp, seed + comm->me
+                                    + comm->nprocs*tid);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairDPDTstatOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fpair;
+  double vxtmp,vytmp,vztmp,delvx,delvy,delvz;
+  double rsq,r,rinv,dot,wd,randnum,factor_dpd;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  const double * const * const x = atom->x;
+  const double * const * const v = atom->v;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double *special_lj = force->special_lj;
+  const double dtinvsqrt = 1.0/sqrt(update->dt);
+  double fxtmp,fytmp,fztmp;
+  RanMars &rng = *random_thr[thr->get_tid()];
+
+  // adjust sigma if target T is changing
+
+  if (t_start != t_stop) {
+    double delta = update->ntimestep - update->beginstep;
+    delta /= update->endstep - update->beginstep;
+    temperature = t_start + delta * (t_stop-t_start);
+    double boltz = force->boltz;
+    for (i = 1; i <= atom->ntypes; i++)
+      for (j = i; j <= atom->ntypes; j++)
+        sigma[i][j] = sigma[j][i] = sqrt(2.0*boltz*temperature*gamma[i][j]);
+  }
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    vxtmp = v[i][0];
+    vytmp = v[i][1];
+    vztmp = v[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_dpd = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+        if (r < EPSILON) continue;     // r can be 0.0 in DPD systems
+        rinv = 1.0/r;
+        delvx = vxtmp - v[j][0];
+        delvy = vytmp - v[j][1];
+        delvz = vztmp - v[j][2];
+        dot = delx*delvx + dely*delvy + delz*delvz;
+        wd = 1.0 - r/cut[itype][jtype];
+        randnum = rng.gaussian();
+
+        // drag force = -gamma * wd^2 * (delx dot delv) / r
+        // random force = sigma * wd * rnd * dtinvsqrt;
+
+        fpair = -gamma[itype][jtype]*wd*wd*dot*rinv;
+        fpair += sigma[itype][jtype]*wd*randnum*dtinvsqrt;
+        fpair *= factor_dpd*rinv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 0.0,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairDPDTstatOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairDPDTstat::memory_usage();
+  bytes += comm->nthreads * sizeof(RanMars*);
+  bytes += comm->nthreads * sizeof(RanMars);
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_dpd_tstat_omp.h b/src/USER-OMP/pair_dpd_tstat_omp.h
new file mode 100644
index 0000000000..b04874efc2
--- /dev/null
+++ b/src/USER-OMP/pair_dpd_tstat_omp.h
@@ -0,0 +1,52 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(dpd/tstat/omp,PairDPDTstatOMP)
+
+#else
+
+#ifndef LMP_PAIR_DPD_TSTAT_OMP_H
+#define LMP_PAIR_DPD_TSTAT_OMP_H
+
+#include "pair_dpd_tstat.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairDPDTstatOMP : public PairDPDTstat, public ThrOMP {
+
+ public:
+  PairDPDTstatOMP(class LAMMPS *);
+  virtual ~PairDPDTstatOMP();
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ protected:
+  class RanMars **random_thr;
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_eam_alloy_omp.cpp b/src/USER-OMP/pair_eam_alloy_omp.cpp
new file mode 100644
index 0000000000..3dff868b6a
--- /dev/null
+++ b/src/USER-OMP/pair_eam_alloy_omp.cpp
@@ -0,0 +1,323 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Stephen Foiles (SNL), Murray Daw (SNL)
+------------------------------------------------------------------------- */
+
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
+#include "pair_eam_alloy_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMAlloyOMP::PairEAMAlloyOMP(LAMMPS *lmp) : PairEAMOMP(lmp)
+{
+  one_coeff = 1;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+   read DYNAMO setfl file
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyOMP::coeff(int narg, char **arg)
+{
+  int i,j;
+
+  if (!allocated) allocate();
+
+  if (narg != 3 + atom->ntypes)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // insure I,J args are * *
+
+  if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // read EAM setfl file
+
+  if (setfl) {
+    for (i = 0; i < setfl->nelements; i++) delete [] setfl->elements[i];
+    delete [] setfl->elements;
+    delete [] setfl->mass;
+    memory->destroy(setfl->frho);
+    memory->destroy(setfl->rhor);
+    memory->destroy(setfl->z2r);
+    delete setfl;
+  }
+  setfl = new Setfl();
+  read_file(arg[2]);
+
+  // read args that map atom types to elements in potential file
+  // map[i] = which element the Ith atom type is, -1 if NULL
+
+  for (i = 3; i < narg; i++) {
+    if (strcmp(arg[i],"NULL") == 0) {
+      map[i-2] = -1;
+      continue;
+    }
+    for (j = 0; j < setfl->nelements; j++)
+      if (strcmp(arg[i],setfl->elements[j]) == 0) break;
+    if (j < setfl->nelements) map[i-2] = j;
+    else error->all(FLERR,"No matching element in EAM potential file");
+  }
+
+  // clear setflag since coeff() called once with I,J = * *
+
+  int n = atom->ntypes;
+  for (i = 1; i <= n; i++)
+    for (j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  // set setflag i,j for type pairs where both are mapped to elements
+  // set mass of atom type if i = j
+
+  int count = 0;
+  for (i = 1; i <= n; i++) {
+    for (j = i; j <= n; j++) {
+      if (map[i] >= 0 && map[j] >= 0) {
+        setflag[i][j] = 1;
+        if (i == j) atom->set_mass(i,setfl->mass[map[i]]);
+        count++;
+      }
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element DYNAMO setfl file
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyOMP::read_file(char *filename)
+{
+  Setfl *file = setfl;
+
+  // open potential file
+
+  int me = comm->me;
+  FILE *fptr;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = fopen(filename,"r");
+    if (fptr == NULL) {
+      char str[128];
+      sprintf(str,"Cannot open EAM potential file %s",filename);
+      error->one(FLERR,str);
+    }
+  }
+
+  // read and broadcast header
+  // extract element names from nelements line
+
+  int n;
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    n = strlen(line) + 1;
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)
+    error->all(FLERR,"Incorrect element names in EAM potential file");
+
+  char **words = new char*[file->nelements+1];
+  nwords = 0;
+  strtok(line," \t\n\r\f");
+  while (words[nwords++] = strtok(NULL," \t\n\r\f")) continue;
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);
+  }
+  delete [] words;
+
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    sscanf(line,"%d %lg %d %lg %lg",
+           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho");
+  memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,file->nr+1,
+                 "pair:z2r");
+
+  int i,j,tmp;
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      fgets(line,MAXLINE,fptr);
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+    if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]);
+    MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world);
+  }
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   copy read-in setfl potential to standard array format
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyOMP::file2array()
+{
+  int i,j,m,n;
+  int ntypes = atom->ntypes;
+
+  // set function params directly from setfl file
+
+  nrho = setfl->nrho;
+  nr = setfl->nr;
+  drho = setfl->drho;
+  dr = setfl->dr;
+
+  // ------------------------------------------------------------------
+  // setup frho arrays
+  // ------------------------------------------------------------------
+
+  // allocate frho arrays
+  // nfrho = # of setfl elements + 1 for zero array
+
+  nfrho = setfl->nelements + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  // copy each element's frho to global frho
+
+  for (i = 0; i < setfl->nelements; i++)
+    for (m = 1; m <= nrho; m++) frho[i][m] = setfl->frho[i][m];
+
+  // add extra frho of zeroes for non-EAM types to point to (pair hybrid)
+  // this is necessary b/c fp is still computed for non-EAM atoms
+
+  for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0;
+
+  // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to
+  // if atom type doesn't point to element (non-EAM atom in pair hybrid)
+  // then map it to last frho array of zeroes
+
+  for (i = 1; i <= ntypes; i++)
+    if (map[i] >= 0) type2frho[i] = map[i];
+    else type2frho[i] = nfrho-1;
+
+  // ------------------------------------------------------------------
+  // setup rhor arrays
+  // ------------------------------------------------------------------
+
+  // allocate rhor arrays
+  // nrhor = # of setfl elements
+
+  nrhor = setfl->nelements;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  // copy each element's rhor to global rhor
+
+  for (i = 0; i < setfl->nelements; i++)
+    for (m = 1; m <= nr; m++) rhor[i][m] = setfl->rhor[i][m];
+
+  // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
+  // for setfl files, I,J mapping only depends on I
+  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
+
+  for (i = 1; i <= ntypes; i++)
+    for (j = 1; j <= ntypes; j++)
+      type2rhor[i][j] = map[i];
+
+  // ------------------------------------------------------------------
+  // setup z2r arrays
+  // ------------------------------------------------------------------
+
+  // allocate z2r arrays
+  // nz2r = N*(N+1)/2 where N = # of setfl elements
+
+  nz2r = setfl->nelements * (setfl->nelements+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // copy each element pair z2r to global z2r, only for I >= J
+
+  n = 0;
+  for (i = 0; i < setfl->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      for (m = 1; m <= nr; m++) z2r[n][m] = setfl->z2r[i][j][m];
+      n++;
+    }
+
+  // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to
+  // set of z2r arrays only fill lower triangular Nelement matrix
+  // value = n = sum over rows of lower-triangular matrix until reach irow,icol
+  // swap indices when irow < icol to stay lower triangular
+  // if map = -1 (non-EAM atom in pair hybrid):
+  //   type2z2r is not used by non-opt
+  //   but set type2z2r to 0 since accessed by opt
+
+  int irow,icol;
+  for (i = 1; i <= ntypes; i++) {
+    for (j = 1; j <= ntypes; j++) {
+      irow = map[i];
+      icol = map[j];
+      if (irow == -1 || icol == -1) {
+        type2z2r[i][j] = 0;
+        continue;
+      }
+      if (irow < icol) {
+        irow = map[j];
+        icol = map[i];
+      }
+      n = 0;
+      for (m = 0; m < irow; m++) n += m + 1;
+      n += icol;
+      type2z2r[i][j] = n;
+    }
+  }
+}
diff --git a/src/USER-OMP/pair_eam_alloy_omp.h b/src/USER-OMP/pair_eam_alloy_omp.h
new file mode 100644
index 0000000000..a044b18f60
--- /dev/null
+++ b/src/USER-OMP/pair_eam_alloy_omp.h
@@ -0,0 +1,43 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/alloy/omp,PairEAMAlloyOMP)
+
+#else
+
+#ifndef LMP_PAIR_EAM_ALLOY_OMP_H
+#define LMP_PAIR_EAM_ALLOY_OMP_H
+
+#include "pair_eam_omp.h"
+
+namespace LAMMPS_NS {
+
+// need virtual public b/c of how eam/alloy/opt inherits from it
+
+class PairEAMAlloyOMP : virtual public PairEAMOMP {
+ public:
+  PairEAMAlloyOMP(class LAMMPS *);
+  virtual ~PairEAMAlloyOMP() {}
+  void coeff(int, char **);
+
+ protected:
+  void read_file(char *);
+  void file2array();
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_eam_fs_omp.cpp b/src/USER-OMP/pair_eam_fs_omp.cpp
new file mode 100644
index 0000000000..84101913c5
--- /dev/null
+++ b/src/USER-OMP/pair_eam_fs_omp.cpp
@@ -0,0 +1,332 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Tim Lau (MIT)
+------------------------------------------------------------------------- */
+
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
+#include "pair_eam_fs_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMFSOMP::PairEAMFSOMP(LAMMPS *lmp) : PairEAMOMP(lmp)
+{
+  one_coeff = 1;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+   read EAM Finnis-Sinclair file
+------------------------------------------------------------------------- */
+
+void PairEAMFSOMP::coeff(int narg, char **arg)
+{
+  int i,j;
+
+  if (!allocated) allocate();
+
+  if (narg != 3 + atom->ntypes)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // insure I,J args are * *
+
+  if (strcmp(arg[0],"*") != 0 || strcmp(arg[1],"*") != 0)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  // read EAM Finnis-Sinclair file
+
+  if (fs) {
+    for (i = 0; i < fs->nelements; i++) delete [] fs->elements[i];
+    delete [] fs->elements;
+    delete [] fs->mass;
+    memory->destroy(fs->frho);
+    memory->destroy(fs->rhor);
+    memory->destroy(fs->z2r);
+    delete fs;
+  }
+  fs = new Fs();
+  read_file(arg[2]);
+
+  // read args that map atom types to elements in potential file
+  // map[i] = which element the Ith atom type is, -1 if NULL
+
+  for (i = 3; i < narg; i++) {
+    if (strcmp(arg[i],"NULL") == 0) {
+      map[i-2] = -1;
+      continue;
+    }
+    for (j = 0; j < fs->nelements; j++)
+      if (strcmp(arg[i],fs->elements[j]) == 0) break;
+    if (j < fs->nelements) map[i-2] = j;
+    else error->all(FLERR,"No matching element in EAM potential file");
+  }
+
+  // clear setflag since coeff() called once with I,J = * *
+
+  int n = atom->ntypes;
+  for (i = 1; i <= n; i++)
+    for (j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  // set setflag i,j for type pairs where both are mapped to elements
+  // set mass of atom type if i = j
+
+  int count = 0;
+  for (i = 1; i <= n; i++) {
+    for (j = i; j <= n; j++) {
+      if (map[i] >= 0 && map[j] >= 0) {
+        setflag[i][j] = 1;
+        if (i == j) atom->set_mass(i,fs->mass[map[i]]);
+        count++;
+      }
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element DYNAMO setfl file
+------------------------------------------------------------------------- */
+
+void PairEAMFSOMP::read_file(char *filename)
+{
+  Fs *file = fs;
+
+  // open potential file
+
+  int me = comm->me;
+  FILE *fptr;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = fopen(filename,"r");
+    if (fptr == NULL) {
+      char str[128];
+      sprintf(str,"Cannot open EAM potential file %s",filename);
+      error->one(FLERR,str);
+    }
+  }
+
+  // read and broadcast header
+  // extract element names from nelements line
+
+  int n;
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    n = strlen(line) + 1;
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)
+    error->all(FLERR,"Incorrect element names in EAM potential file");
+
+  char **words = new char*[file->nelements+1];
+  nwords = 0;
+  strtok(line," \t\n\r\f");
+  while (words[nwords++] = strtok(NULL," \t\n\r\f")) continue;
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);
+  }
+  delete [] words;
+
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    sscanf(line,"%d %lg %d %lg %lg",
+           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,
+                                              "pair:frho");
+  memory->create(file->rhor,file->nelements,file->nelements,
+                 file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,
+                 file->nr+1,"pair:z2r");
+
+  int i,j,tmp;
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      fgets(line,MAXLINE,fptr);
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+
+    for (j = 0; j < file->nelements; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]);
+      MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+  }
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   copy read-in setfl potential to standard array format
+------------------------------------------------------------------------- */
+
+void PairEAMFSOMP::file2array()
+{
+  int i,j,m,n;
+  int ntypes = atom->ntypes;
+
+  // set function params directly from fs file
+
+  nrho = fs->nrho;
+  nr = fs->nr;
+  drho = fs->drho;
+  dr = fs->dr;
+
+  // ------------------------------------------------------------------
+  // setup frho arrays
+  // ------------------------------------------------------------------
+
+  // allocate frho arrays
+  // nfrho = # of fs elements + 1 for zero array
+
+  nfrho = fs->nelements + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  // copy each element's frho to global frho
+
+  for (i = 0; i < fs->nelements; i++)
+    for (m = 1; m <= nrho; m++) frho[i][m] = fs->frho[i][m];
+
+  // add extra frho of zeroes for non-EAM types to point to (pair hybrid)
+  // this is necessary b/c fp is still computed for non-EAM atoms
+
+  for (m = 1; m <= nrho; m++) frho[nfrho-1][m] = 0.0;
+
+  // type2frho[i] = which frho array (0 to nfrho-1) each atom type maps to
+  // if atom type doesn't point to element (non-EAM atom in pair hybrid)
+  // then map it to last frho array of zeroes
+
+  for (i = 1; i <= ntypes; i++)
+    if (map[i] >= 0) type2frho[i] = map[i];
+    else type2frho[i] = nfrho-1;
+
+  // ------------------------------------------------------------------
+  // setup rhor arrays
+  // ------------------------------------------------------------------
+
+  // allocate rhor arrays
+  // nrhor = square of # of fs elements
+
+  nrhor = fs->nelements * fs->nelements;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  // copy each element pair rhor to global rhor
+
+  n = 0;
+  for (i = 0; i < fs->nelements; i++)
+    for (j = 0; j < fs->nelements; j++) {
+      for (m = 1; m <= nr; m++) rhor[n][m] = fs->rhor[i][j][m];
+      n++;
+    }
+
+  // type2rhor[i][j] = which rhor array (0 to nrhor-1) each type pair maps to
+  // for fs files, there is a full NxN set of rhor arrays
+  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
+
+  for (i = 1; i <= ntypes; i++)
+    for (j = 1; j <= ntypes; j++)
+      type2rhor[i][j] = map[i] * fs->nelements + map[j];
+
+  // ------------------------------------------------------------------
+  // setup z2r arrays
+  // ------------------------------------------------------------------
+
+  // allocate z2r arrays
+  // nz2r = N*(N+1)/2 where N = # of fs elements
+
+  nz2r = fs->nelements * (fs->nelements+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // copy each element pair z2r to global z2r, only for I >= J
+
+  n = 0;
+  for (i = 0; i < fs->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      for (m = 1; m <= nr; m++) z2r[n][m] = fs->z2r[i][j][m];
+      n++;
+    }
+
+  // type2z2r[i][j] = which z2r array (0 to nz2r-1) each type pair maps to
+  // set of z2r arrays only fill lower triangular Nelement matrix
+  // value = n = sum over rows of lower-triangular matrix until reach irow,icol
+  // swap indices when irow < icol to stay lower triangular
+  // if map = -1 (non-EAM atom in pair hybrid):
+  //   type2z2r is not used by non-opt
+  //   but set type2z2r to 0 since accessed by opt
+
+  int irow,icol;
+  for (i = 1; i <= ntypes; i++) {
+    for (j = 1; j <= ntypes; j++) {
+      irow = map[i];
+      icol = map[j];
+      if (irow == -1 || icol == -1) {
+        type2z2r[i][j] = 0;
+        continue;
+      }
+      if (irow < icol) {
+        irow = map[j];
+        icol = map[i];
+      }
+      n = 0;
+      for (m = 0; m < irow; m++) n += m + 1;
+      n += icol;
+      type2z2r[i][j] = n;
+    }
+  }
+}
diff --git a/src/USER-OMP/pair_eam_fs_omp.h b/src/USER-OMP/pair_eam_fs_omp.h
new file mode 100644
index 0000000000..42d4d59eb6
--- /dev/null
+++ b/src/USER-OMP/pair_eam_fs_omp.h
@@ -0,0 +1,43 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/fs/omp,PairEAMFSOMP)
+
+#else
+
+#ifndef LMP_PAIR_EAM_FS_OMP_H
+#define LMP_PAIR_EAM_FS_OMP_H
+
+#include "pair_eam_omp.h"
+
+namespace LAMMPS_NS {
+
+// need virtual public b/c of how eam/fs/opt inherits from it
+
+class PairEAMFSOMP : virtual public PairEAMOMP {
+ public:
+  PairEAMFSOMP(class LAMMPS *);
+  virtual ~PairEAMFSOMP() {}
+  void coeff(int, char **);
+
+ protected:
+  void read_file(char *);
+  void file2array();
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_eam_omp.cpp b/src/USER-OMP/pair_eam_omp.cpp
new file mode 100644
index 0000000000..59e58d1b73
--- /dev/null
+++ b/src/USER-OMP/pair_eam_omp.cpp
@@ -0,0 +1,303 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "string.h"
+
+#include "pair_eam_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMOMP::PairEAMOMP(LAMMPS *lmp) :
+  PairEAM(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = eflag_global = eflag_atom = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  // grow energy and fp arrays if necessary
+  // need to be atom->nmax in length
+
+  if (atom->nmax > nmax) {
+    memory->destroy(rho);
+    memory->destroy(fp);
+    nmax = atom->nmax;
+    memory->create(rho,nthreads*nmax,"pair:rho");
+    memory->create(fp,nmax,"pair:fp");
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (force->newton_pair)
+      thr->init_eam(nall, rho);
+    else
+      thr->init_eam(atom->nlocal, rho);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairEAMOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,m,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r,p,rhoip,rhojp,z2,z2p,recip,phip,psip,phi;
+  double *coeff;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  double * const rho_t = thr->get_rho();
+  const int tid = thr->get_tid();
+  const int nthreads = comm->nthreads;
+
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // rho = density at each atom
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq) {
+        jtype = type[j];
+        p = sqrt(rsq)*rdr + 1.0;
+        m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+        coeff = rhor_spline[type2rhor[jtype][itype]][m];
+        rho_t[i] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+        if (NEWTON_PAIR || j < nlocal) {
+          coeff = rhor_spline[type2rhor[itype][jtype]][m];
+          rho_t[j] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+        }
+      }
+    }
+  }
+
+  // wait until all threads are done with computation
+  sync_threads();
+
+  // communicate and sum densities
+
+  if (NEWTON_PAIR) {
+    // reduce per thread density
+    data_reduce_thr(rho, nall, nthreads, 1, tid);
+
+    // wait until reduction is complete
+    sync_threads();
+
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+    { comm->reverse_comm_pair(this); }
+
+    // wait until master thread is done with communication
+    sync_threads();
+
+  } else {
+    data_reduce_thr(rho, nlocal, nthreads, 1, tid);
+
+    // wait until reduction is complete
+    sync_threads();
+  }
+
+  // fp = derivative of embedding energy at each atom
+  // phi = embedding energy at each atom
+  // if rho > rhomax (e.g. due to close approach of two atoms),
+  //   will exceed table, so add linear term to conserve energy
+
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    p = rho[i]*rdrho + 1.0;
+    m = static_cast<int> (p);
+    m = MAX(1,MIN(m,nrho-1));
+    p -= m;
+    p = MIN(p,1.0);
+    coeff = frho_spline[type2frho[type[i]]][m];
+    fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2];
+    if (EFLAG) {
+      phi = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+      if (rho[i] > rhomax) phi += fp[i] * (rho[i]-rhomax);
+      e_tally_thr(this, i, i, nlocal, NEWTON_PAIR, phi, 0.0, thr);
+    }
+  }
+
+  // wait until all theads are done with computation
+  sync_threads();
+
+  // communicate derivative of embedding function
+  // MPI communication only on master thread
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+  { comm->forward_comm_pair(this); }
+
+  // wait until master thread is done with communication
+  sync_threads();
+
+  // compute forces on each atom
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    fxtmp = fytmp = fztmp = 0.0;
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq) {
+        jtype = type[j];
+        r = sqrt(rsq);
+        p = r*rdr + 1.0;
+        m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+
+        // rhoip = derivative of (density at atom j due to atom i)
+        // rhojp = derivative of (density at atom i due to atom j)
+        // phi = pair potential energy
+        // phip = phi'
+        // z2 = phi * r
+        // z2p = (phi * r)' = (phi' r) + phi
+        // psip needs both fp[i] and fp[j] terms since r_ij appears in two
+        //   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
+        //   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
+
+        coeff = rhor_spline[type2rhor[itype][jtype]][m];
+        rhoip = (coeff[0]*p + coeff[1])*p + coeff[2];
+        coeff = rhor_spline[type2rhor[jtype][itype]][m];
+        rhojp = (coeff[0]*p + coeff[1])*p + coeff[2];
+        coeff = z2r_spline[type2z2r[itype][jtype]][m];
+        z2p = (coeff[0]*p + coeff[1])*p + coeff[2];
+        z2 = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+
+        recip = 1.0/r;
+        phi = z2*recip;
+        phip = z2p*recip - phi*recip;
+        psip = fp[i]*rhojp + fp[j]*rhoip + phip;
+        fpair = -psip*recip;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) evdwl = phi;
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairEAMOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairEAM::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_eam_omp.h b/src/USER-OMP/pair_eam_omp.h
new file mode 100644
index 0000000000..0f9d3cd51e
--- /dev/null
+++ b/src/USER-OMP/pair_eam_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/omp,PairEAMOMP)
+
+#else
+
+#ifndef LMP_PAIR_EAM_OMP_H
+#define LMP_PAIR_EAM_OMP_H
+
+#include "pair_eam.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairEAMOMP : public PairEAM, public ThrOMP {
+
+ public:
+  PairEAMOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int iifrom, int iito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_edip_omp.cpp b/src/USER-OMP/pair_edip_omp.cpp
new file mode 100644
index 0000000000..d9d68d7a07
--- /dev/null
+++ b/src/USER-OMP/pair_edip_omp.cpp
@@ -0,0 +1,492 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_edip_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define GRIDDENSITY 8000
+#define GRIDSTART 0.1
+
+// max number of interaction per atom for f(Z) environment potential
+
+#define leadDimInteractionList 64
+
+/* ---------------------------------------------------------------------- */
+
+PairEDIPOMP::PairEDIPOMP(LAMMPS *lmp) :
+  PairEDIP(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEDIPOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = vflag_atom = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (vflag_atom) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (vflag_atom) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else eval<0,0,0>(ifrom, ito, thr);
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int VFLAG_ATOM>
+void PairEDIPOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,k,ii,inum,jnum;
+  int itype,jtype,ktype,ijparam,ikparam,ijkparam;
+  double xtmp,ytmp,ztmp,evdwl;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  register int preForceCoord_counter;
+
+  double invR_ij;
+  double invR_ik;
+  double directorCos_ij_x;
+  double directorCos_ij_y;
+  double directorCos_ij_z;
+  double directorCos_ik_x;
+  double directorCos_ik_y;
+  double directorCos_ik_z;
+  double cosTeta;
+
+  int interpolIDX;
+  double interpolTMP;
+  double interpolDeltaX;
+  double interpolY1;
+  double interpolY2;
+
+  double invRMinusCutoffA;
+  double sigmaInvRMinusCutoffA;
+  double gammInvRMinusCutoffA;
+  double cosTetaDiff;
+  double cosTetaDiffCosTetaDiff;
+  double cutoffFunction_ij;
+  double exp2B_ij;
+  double exp2BDerived_ij;
+  double pow2B_ij;
+  double pow2BDerived_ij;
+  double exp3B_ij;
+  double exp3BDerived_ij;
+  double exp3B_ik;
+  double exp3BDerived_ik;
+  double qFunction;
+  double qFunctionDerived;
+  double tauFunction;
+  double tauFunctionDerived;
+  double expMinusBetaZeta_iZeta_i;
+  double qFunctionCosTetaDiffCosTetaDiff;
+  double expMinusQFunctionCosTetaDiffCosTetaDiff;
+  double zeta_i;
+  double zeta_iDerived;
+  double zeta_iDerivedInvR_ij;
+
+  double forceModCoord_factor;
+  double forceModCoord;
+  double forceModCoord_ij;
+  double forceMod2B;
+  double forceMod3B_factor1_ij;
+  double forceMod3B_factor2_ij;
+  double forceMod3B_factor2;
+  double forceMod3B_factor1_ik;
+  double forceMod3B_factor2_ik;
+  double potentia3B_factor;
+  double potential2B_factor;
+
+  const int tid = thr->get_tid();
+
+  double *pre_thrInvR_ij = preInvR_ij + tid * leadDimInteractionList;
+  double *pre_thrExp3B_ij = preExp3B_ij + tid * leadDimInteractionList;
+  double *pre_thrExp3BDerived_ij = preExp3BDerived_ij + tid * leadDimInteractionList;
+  double *pre_thrExp2B_ij = preExp2B_ij + tid * leadDimInteractionList;
+  double *pre_thrExp2BDerived_ij = preExp2BDerived_ij + tid * leadDimInteractionList;
+  double *pre_thrPow2B_ij = prePow2B_ij + tid * leadDimInteractionList;
+  double *pre_thrForceCoord = preForceCoord + tid * leadDimInteractionList;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over full neighbor list of my atoms
+
+  for (ii = iifrom; ii < iito; ii++) {
+    zeta_i = 0.0;
+    int numForceCoordPairs = 0;
+
+    i = ilist[ii];
+    itype = map[type[i]];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    // pre-loop to compute environment coordination f(Z)
+
+    for (int neighbor_j = 0; neighbor_j < jnum; neighbor_j++) {
+        j = jlist[neighbor_j];
+        j &= NEIGHMASK;
+
+        double dr_ij[3], r_ij;
+
+        dr_ij[0] = xtmp - x[j][0];
+        dr_ij[1] = ytmp - x[j][1];
+        dr_ij[2] = ztmp - x[j][2];
+        r_ij = dr_ij[0]*dr_ij[0] + dr_ij[1]*dr_ij[1] + dr_ij[2]*dr_ij[2];
+
+        jtype = map[type[j]];
+        ijparam = elem2param[itype][jtype][jtype];
+        if (r_ij > params[ijparam].cutsq) continue;
+
+        r_ij = sqrt(r_ij);
+
+        invR_ij = 1.0 / r_ij;
+        pre_thrInvR_ij[neighbor_j] = invR_ij;
+
+        invRMinusCutoffA =  1.0 / (r_ij - cutoffA);
+        sigmaInvRMinusCutoffA = sigma * invRMinusCutoffA;
+        gammInvRMinusCutoffA = gamm * invRMinusCutoffA;
+
+        interpolDeltaX = r_ij - GRIDSTART;
+        interpolTMP = (interpolDeltaX * GRIDDENSITY);
+        interpolIDX = (int) interpolTMP;
+
+        interpolY1 = exp3B[interpolIDX];
+        interpolY2 = exp3B[interpolIDX+1];
+        exp3B_ij = interpolY1 + (interpolY2 - interpolY1) *
+          (interpolTMP-interpolIDX);
+
+        exp3BDerived_ij = - exp3B_ij * gammInvRMinusCutoffA * invRMinusCutoffA;
+
+        pre_thrExp3B_ij[neighbor_j] = exp3B_ij;
+        pre_thrExp3BDerived_ij[neighbor_j] = exp3BDerived_ij;
+
+        interpolY1 = exp2B[interpolIDX];
+        interpolY2 = exp2B[interpolIDX+1];
+        exp2B_ij = interpolY1 + (interpolY2 - interpolY1) *
+          (interpolTMP-interpolIDX);
+
+        exp2BDerived_ij = - exp2B_ij * sigmaInvRMinusCutoffA * invRMinusCutoffA;
+
+        pre_thrExp2B_ij[neighbor_j] = exp2B_ij;
+        pre_thrExp2BDerived_ij[neighbor_j] = exp2BDerived_ij;
+
+        interpolY1 = pow2B[interpolIDX];
+        interpolY2 = pow2B[interpolIDX+1];
+        pow2B_ij = interpolY1 + (interpolY2 - interpolY1) *
+          (interpolTMP-interpolIDX);
+
+        pre_thrPow2B_ij[neighbor_j] = pow2B_ij;
+
+        // zeta and its derivative
+
+        if (r_ij < cutoffC) zeta_i += 1.0;
+        else {
+            interpolY1 = cutoffFunction[interpolIDX];
+            interpolY2 = cutoffFunction[interpolIDX+1];
+            cutoffFunction_ij = interpolY1 + (interpolY2 - interpolY1) *
+              (interpolTMP-interpolIDX);
+
+            zeta_i += cutoffFunction_ij;
+
+            interpolY1 = cutoffFunctionDerived[interpolIDX];
+            interpolY2 = cutoffFunctionDerived[interpolIDX+1];
+            zeta_iDerived = interpolY1 + (interpolY2 - interpolY1) *
+              (interpolTMP-interpolIDX);
+
+            zeta_iDerivedInvR_ij = zeta_iDerived * invR_ij;
+
+            preForceCoord_counter=numForceCoordPairs*5;
+            pre_thrForceCoord[preForceCoord_counter+0]=zeta_iDerivedInvR_ij;
+            pre_thrForceCoord[preForceCoord_counter+1]=dr_ij[0];
+            pre_thrForceCoord[preForceCoord_counter+2]=dr_ij[1];
+            pre_thrForceCoord[preForceCoord_counter+3]=dr_ij[2];
+            pre_thrForceCoord[preForceCoord_counter+4]=j;
+            numForceCoordPairs++;
+        }
+    }
+
+    // quantities depending on zeta_i
+
+    interpolDeltaX = zeta_i;
+    interpolTMP = (interpolDeltaX * GRIDDENSITY);
+    interpolIDX = (int) interpolTMP;
+
+    interpolY1 = expMinusBetaZeta_iZeta_iGrid[interpolIDX];
+    interpolY2 = expMinusBetaZeta_iZeta_iGrid[interpolIDX+1];
+    expMinusBetaZeta_iZeta_i = interpolY1 + (interpolY2 - interpolY1) *
+      (interpolTMP-interpolIDX);
+
+    interpolY1 = qFunctionGrid[interpolIDX];
+    interpolY2 = qFunctionGrid[interpolIDX+1];
+    qFunction = interpolY1 + (interpolY2 - interpolY1) *
+      (interpolTMP-interpolIDX);
+
+    interpolY1 = tauFunctionGrid[interpolIDX];
+    interpolY2 = tauFunctionGrid[interpolIDX+1];
+    tauFunction = interpolY1 + (interpolY2 - interpolY1) *
+      (interpolTMP-interpolIDX);
+
+    interpolY1 = tauFunctionDerivedGrid[interpolIDX];
+    interpolY2 = tauFunctionDerivedGrid[interpolIDX+1];
+    tauFunctionDerived = interpolY1 + (interpolY2 - interpolY1) *
+      (interpolTMP-interpolIDX);
+
+    qFunctionDerived = -mu * qFunction;
+
+    forceModCoord_factor = 2.0 * beta * zeta_i * expMinusBetaZeta_iZeta_i;
+
+    forceModCoord = 0.0;
+
+    // two-body interactions, skip half of them
+
+    for (int neighbor_j = 0; neighbor_j < jnum; neighbor_j++) {
+      double dr_ij[3], r_ij, f_ij[3];
+
+      j = jlist[neighbor_j];
+      j &= NEIGHMASK;
+
+      dr_ij[0] = x[j][0] - xtmp;
+      dr_ij[1] = x[j][1] - ytmp;
+      dr_ij[2] = x[j][2] - ztmp;
+      r_ij = dr_ij[0]*dr_ij[0] + dr_ij[1]*dr_ij[1] + dr_ij[2]*dr_ij[2];
+
+      jtype = map[type[j]];
+      ijparam = elem2param[itype][jtype][jtype];
+      if (r_ij > params[ijparam].cutsq) continue;
+
+      r_ij = sqrt(r_ij);
+
+      invR_ij = pre_thrInvR_ij[neighbor_j];
+      pow2B_ij = pre_thrPow2B_ij[neighbor_j];
+
+      potential2B_factor = pow2B_ij - expMinusBetaZeta_iZeta_i;
+
+      exp2B_ij = pre_thrExp2B_ij[neighbor_j];
+
+      pow2BDerived_ij = - rho * invR_ij * pow2B_ij;
+
+      forceModCoord += (forceModCoord_factor*exp2B_ij);
+
+      exp2BDerived_ij = pre_thrExp2BDerived_ij[neighbor_j];
+      forceMod2B = exp2BDerived_ij * potential2B_factor +
+        exp2B_ij * pow2BDerived_ij;
+
+      directorCos_ij_x = invR_ij * dr_ij[0];
+      directorCos_ij_y = invR_ij * dr_ij[1];
+      directorCos_ij_z = invR_ij * dr_ij[2];
+
+      exp3B_ij = pre_thrExp3B_ij[neighbor_j];
+      exp3BDerived_ij = pre_thrExp3BDerived_ij[neighbor_j];
+
+      f_ij[0] = forceMod2B * directorCos_ij_x;
+      f_ij[1] = forceMod2B * directorCos_ij_y;
+      f_ij[2] = forceMod2B * directorCos_ij_z;
+
+      f[j][0] -= f_ij[0];
+      f[j][1] -= f_ij[1];
+      f[j][2] -= f_ij[2];
+
+      f[i][0] += f_ij[0];
+      f[i][1] += f_ij[1];
+      f[i][2] += f_ij[2];
+
+      // potential energy
+
+      evdwl = (exp2B_ij * potential2B_factor);
+
+      if (EVFLAG) ev_tally_thr(this,i, j, nlocal, /* newton_pair */ 1, evdwl, 0.0,
+                               -forceMod2B*invR_ij, dr_ij[0], dr_ij[1], dr_ij[2],thr);
+
+      // three-body Forces
+
+      for (int neighbor_k = neighbor_j + 1; neighbor_k < jnum; neighbor_k++) {
+          double dr_ik[3], r_ik, f_ik[3];
+
+          k = jlist[neighbor_k];
+          k &= NEIGHMASK;
+          ktype = map[type[k]];
+          ikparam = elem2param[itype][ktype][ktype];
+          ijkparam = elem2param[itype][jtype][ktype];
+
+          dr_ik[0] = x[k][0] - xtmp;
+          dr_ik[1] = x[k][1] - ytmp;
+          dr_ik[2] = x[k][2] - ztmp;
+          r_ik = dr_ik[0]*dr_ik[0] + dr_ik[1]*dr_ik[1] + dr_ik[2]*dr_ik[2];
+
+          if (r_ik > params[ikparam].cutsq) continue;
+
+          r_ik = sqrt(r_ik);
+
+          invR_ik = pre_thrInvR_ij[neighbor_k];
+
+          directorCos_ik_x = invR_ik * dr_ik[0];
+          directorCos_ik_y = invR_ik * dr_ik[1];
+          directorCos_ik_z = invR_ik * dr_ik[2];
+
+          cosTeta = directorCos_ij_x * directorCos_ik_x +
+            directorCos_ij_y * directorCos_ik_y +
+            directorCos_ij_z * directorCos_ik_z;
+
+          cosTetaDiff = cosTeta + tauFunction;
+          cosTetaDiffCosTetaDiff = cosTetaDiff * cosTetaDiff;
+          qFunctionCosTetaDiffCosTetaDiff = cosTetaDiffCosTetaDiff * qFunction;
+          expMinusQFunctionCosTetaDiffCosTetaDiff =
+            exp(-qFunctionCosTetaDiffCosTetaDiff);
+
+          potentia3B_factor = lambda *
+            ((1.0 - expMinusQFunctionCosTetaDiffCosTetaDiff) +
+             eta * qFunctionCosTetaDiffCosTetaDiff);
+
+          exp3B_ik = pre_thrExp3B_ij[neighbor_k];
+          exp3BDerived_ik = pre_thrExp3BDerived_ij[neighbor_k];
+
+          forceMod3B_factor1_ij = - exp3BDerived_ij * exp3B_ik *
+            potentia3B_factor;
+          forceMod3B_factor2 = 2.0 * lambda * exp3B_ij * exp3B_ik *
+            qFunction * cosTetaDiff *
+            (eta + expMinusQFunctionCosTetaDiffCosTetaDiff);
+          forceMod3B_factor2_ij = forceMod3B_factor2 * invR_ij;
+
+          f_ij[0] = forceMod3B_factor1_ij * directorCos_ij_x +
+            forceMod3B_factor2_ij *
+            (cosTeta * directorCos_ij_x - directorCos_ik_x);
+          f_ij[1] = forceMod3B_factor1_ij * directorCos_ij_y +
+            forceMod3B_factor2_ij *
+            (cosTeta * directorCos_ij_y - directorCos_ik_y);
+          f_ij[2] = forceMod3B_factor1_ij * directorCos_ij_z +
+            forceMod3B_factor2_ij *
+            (cosTeta * directorCos_ij_z - directorCos_ik_z);
+
+          forceMod3B_factor1_ik = - exp3BDerived_ik * exp3B_ij *
+            potentia3B_factor;
+          forceMod3B_factor2_ik = forceMod3B_factor2 * invR_ik;
+
+          f_ik[0] = forceMod3B_factor1_ik * directorCos_ik_x +
+            forceMod3B_factor2_ik *
+            (cosTeta * directorCos_ik_x - directorCos_ij_x);
+          f_ik[1] = forceMod3B_factor1_ik * directorCos_ik_y +
+            forceMod3B_factor2_ik *
+            (cosTeta * directorCos_ik_y - directorCos_ij_y);
+          f_ik[2] = forceMod3B_factor1_ik * directorCos_ik_z +
+            forceMod3B_factor2_ik *
+            (cosTeta * directorCos_ik_z - directorCos_ij_z);
+
+          forceModCoord += (forceMod3B_factor2 *
+                            (tauFunctionDerived -  0.5 * mu * cosTetaDiff));
+
+          f[j][0] += f_ij[0];
+          f[j][1] += f_ij[1];
+          f[j][2] += f_ij[2];
+
+          f[k][0] += f_ik[0];
+          f[k][1] += f_ik[1];
+          f[k][2] += f_ik[2];
+
+          f[i][0] -= f_ij[0] + f_ik[0];
+          f[i][1] -= f_ij[1] + f_ik[1];
+          f[i][2] -= f_ij[2] + f_ik[2];
+
+          // potential energy
+
+          evdwl = (exp3B_ij * exp3B_ik * potentia3B_factor);
+
+          if (evflag) ev_tally3_thr(this,i,j,k,evdwl,0.0,f_ij,f_ik,dr_ij,dr_ik,thr);
+      }
+    }
+
+    // forces due to environment coordination f(Z)
+
+    for (int idx = 0; idx < numForceCoordPairs; idx++) {
+        double dr_ij[3], f_ij[3];
+
+        preForceCoord_counter = idx * 5;
+        zeta_iDerivedInvR_ij=pre_thrForceCoord[preForceCoord_counter+0];
+        dr_ij[0]=pre_thrForceCoord[preForceCoord_counter+1];
+        dr_ij[1]=pre_thrForceCoord[preForceCoord_counter+2];
+        dr_ij[2]=pre_thrForceCoord[preForceCoord_counter+3];
+        j = static_cast<int> (pre_thrForceCoord[preForceCoord_counter+4]);
+
+        forceModCoord_ij = forceModCoord * zeta_iDerivedInvR_ij;
+
+        f_ij[0] = forceModCoord_ij * dr_ij[0];
+        f_ij[1] = forceModCoord_ij * dr_ij[1];
+        f_ij[2] = forceModCoord_ij * dr_ij[2];
+
+        f[j][0] -= f_ij[0];
+        f[j][1] -= f_ij[1];
+        f[j][2] -= f_ij[2];
+
+        f[i][0] += f_ij[0];
+        f[i][1] += f_ij[1];
+        f[i][2] += f_ij[2];
+
+        // potential energy
+
+        evdwl = 0.0;
+        if (EVFLAG) ev_tally_thr(this,i, j, nlocal, /* newton_pair */ 1, 0.0, 0.0,
+                                 forceModCoord_ij, dr_ij[0], dr_ij[1], dr_ij[2],thr);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairEDIPOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairEDIP::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_edip_omp.h b/src/USER-OMP/pair_edip_omp.h
new file mode 100644
index 0000000000..55e10c83bb
--- /dev/null
+++ b/src/USER-OMP/pair_edip_omp.h
@@ -0,0 +1,43 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(edip/omp,PairEDIPOMP)
+
+#else
+
+#ifndef LMP_PAIR_EDIP_OMP_H
+#define LMP_PAIR_EDIP_OMP_H
+
+#include "pair_edip.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairEDIPOMP : public PairEDIP, public ThrOMP {
+
+ public:
+  PairEDIPOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int VFLAG_ATOM>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_eim_omp.cpp b/src/USER-OMP/pair_eim_omp.cpp
new file mode 100644
index 0000000000..ad0909f242
--- /dev/null
+++ b/src/USER-OMP/pair_eim_omp.cpp
@@ -0,0 +1,355 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "string.h"
+
+#include "pair_eim_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairEIMOMP::PairEIMOMP(LAMMPS *lmp) :
+  PairEIM(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEIMOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = eflag_global = eflag_atom = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  // grow energy and fp arrays if necessary
+  // need to be atom->nmax in length
+
+  if (atom->nmax > nmax) {
+    memory->destroy(rho);
+    memory->destroy(fp);
+    nmax = atom->nmax;
+    memory->create(rho,nthreads*nmax,"pair:rho");
+    memory->create(fp,nthreads*nmax,"pair:fp");
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (force->newton_pair)
+      thr->init_eim(nall, rho, fp);
+    else
+      thr->init_eim(atom->nlocal, rho, fp);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairEIMOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,m,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r,p,rhoip,rhojp,phip,phi,coul,coulp,recip,psip;
+  double *coeff;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  double * const rho_t = thr->get_rho();
+  double * const fp_t = thr->get_fp();
+  const int tid = thr->get_tid();
+  const int nthreads = comm->nthreads;
+
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // rho = density at each atom
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq[itype][jtype]) {
+        p = sqrt(rsq)*rdr + 1.0;
+        m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+        coeff = Fij_spline[type2Fij[itype][jtype]][m];
+        rho_t[i] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+        if (NEWTON_PAIR || j < nlocal) {
+          coeff = Fij_spline[type2Fij[jtype][itype]][m];
+          rho_t[j] += ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+        }
+      }
+    }
+  }
+
+  // wait until all threads are done with computation
+  sync_threads();
+
+  // communicate and sum densities
+  if (NEWTON_PAIR) {
+    // reduce per thread density
+    data_reduce_thr(rho, nall, nthreads, 1, tid);
+
+    // wait until reduction is complete
+    sync_threads();
+
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+    {
+      rhofp = 1;
+      comm->reverse_comm_pair(this);
+    }
+
+  } else {
+    data_reduce_thr(rho, nlocal, nthreads, 1, tid);
+
+    // wait until reduction is complete
+    sync_threads();
+  }
+
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+  {
+    rhofp = 1;
+    comm->forward_comm_pair(this);
+  }
+
+  // wait until master is finished communicating
+  sync_threads();
+
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq[itype][jtype]) {
+        p = sqrt(rsq)*rdr + 1.0;
+        m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+        coeff = Gij_spline[type2Gij[itype][jtype]][m];
+        fp_t[i] += rho[j]*(((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6]);
+        if (NEWTON_PAIR || j < nlocal) {
+          fp_t[j] += rho[i]*(((coeff[3]*p + coeff[4])*p + coeff[5])*p +
+                           coeff[6]);
+        }
+      }
+    }
+  }
+
+  // wait until all threads are done with computation
+  sync_threads();
+
+  // communicate and sum modified densities
+  if (NEWTON_PAIR) {
+    // reduce per thread density
+    data_reduce_thr(fp, nall, nthreads, 1, tid);
+
+    // wait until reduction is complete
+    sync_threads();
+
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+    {
+      rhofp = 2;
+      comm->reverse_comm_pair(this);
+    }
+
+  } else {
+    data_reduce_thr(fp, nlocal, nthreads, 1, tid);
+
+    // wait until reduction is complete
+    sync_threads();
+  }
+
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+  {
+    rhofp = 2;
+    comm->forward_comm_pair(this);
+  }
+
+  // wait until master is finished communicating
+  sync_threads();
+
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    itype = type[i];
+    if (EFLAG) {
+      phi = 0.5*rho[i]*fp[i];
+      e_tally_thr(this, i, i, nlocal, NEWTON_PAIR, phi, 0.0, thr);
+    }
+  }
+
+  // compute forces on each atom
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    fxtmp = fytmp = fztmp = 0.0;
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cutforcesq[itype][jtype]) {
+        r = sqrt(rsq);
+        p = r*rdr + 1.0;
+        m = static_cast<int> (p);
+        m = MIN(m,nr-1);
+        p -= m;
+        p = MIN(p,1.0);
+
+        // rhoip = derivative of (density at atom j due to atom i)
+        // rhojp = derivative of (density at atom i due to atom j)
+        // phi = pair potential energy
+        // phip = phi'
+
+        coeff = Fij_spline[type2Fij[jtype][itype]][m];
+        rhoip = (coeff[0]*p + coeff[1])*p + coeff[2];
+        coeff = Fij_spline[type2Fij[itype][jtype]][m];
+        rhojp = (coeff[0]*p + coeff[1])*p + coeff[2];
+        coeff = phiij_spline[type2phiij[itype][jtype]][m];
+        phip = (coeff[0]*p + coeff[1])*p + coeff[2];
+        phi = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+        coeff = Gij_spline[type2Gij[itype][jtype]][m];
+        coul = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+        coulp = (coeff[0]*p + coeff[1])*p + coeff[2];
+        psip = phip + (rho[i]*rho[j]-q0[itype]*q0[jtype])*coulp +
+               fp[i]*rhojp + fp[j]*rhoip;
+        recip = 1.0/r;
+        fpair = -psip*recip;
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) evdwl = phi-q0[itype]*q0[jtype]*coul;
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairEIMOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairEIM::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_eim_omp.h b/src/USER-OMP/pair_eim_omp.h
new file mode 100644
index 0000000000..c63222d7fb
--- /dev/null
+++ b/src/USER-OMP/pair_eim_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eim/omp,PairEIMOMP)
+
+#else
+
+#ifndef LMP_PAIR_EIM_OMP_H
+#define LMP_PAIR_EIM_OMP_H
+
+#include "pair_eim.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairEIMOMP : public PairEIM, public ThrOMP {
+
+ public:
+  PairEIMOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int iifrom, int iito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_gauss_cut_omp.cpp b/src/USER-OMP/pair_gauss_cut_omp.cpp
new file mode 100644
index 0000000000..cb54785627
--- /dev/null
+++ b/src/USER-OMP/pair_gauss_cut_omp.cpp
@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_gauss_cut_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairGaussCutOMP::PairGaussCutOMP(LAMMPS *lmp) :
+  PairGaussCut(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGaussCutOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairGaussCutOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r,rexp,ugauss,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+        rexp = (r-rmh[itype][jtype])/sigmah[itype][jtype];
+        ugauss = pgauss[itype][jtype]*exp(-0.5*rexp*rexp);
+        fpair = factor_lj*rexp/r*ugauss/sigmah[itype][jtype];
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = ugauss - offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairGaussCutOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairGaussCut::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_gauss_cut_omp.h b/src/USER-OMP/pair_gauss_cut_omp.h
new file mode 100644
index 0000000000..2da2c14fdb
--- /dev/null
+++ b/src/USER-OMP/pair_gauss_cut_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gauss/cut/omp,PairGaussCutOMP)
+
+#else
+
+#ifndef LMP_PAIR_GAUSS_CUT_OMP_H
+#define LMP_PAIR_GAUSS_CUT_OMP_H
+
+#include "pair_gauss_cut.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairGaussCutOMP : public PairGaussCut, public ThrOMP {
+
+ public:
+  PairGaussCutOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_gauss_omp.cpp b/src/USER-OMP/pair_gauss_omp.cpp
new file mode 100644
index 0000000000..ae8af0d774
--- /dev/null
+++ b/src/USER-OMP/pair_gauss_omp.cpp
@@ -0,0 +1,170 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_gauss_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EPSILON 1.0e-10
+/* ---------------------------------------------------------------------- */
+
+PairGaussOMP::PairGaussOMP(LAMMPS *lmp) :
+  PairGauss(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGaussOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+  double occ = 0.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag) reduction(+:occ)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) occ = eval<1,1,1>(ifrom, ito, thr);
+        else occ = eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) occ = eval<1,0,1>(ifrom, ito, thr);
+        else occ = eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) occ = eval<0,0,1>(ifrom, ito, thr);
+      else occ = eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+
+  if (eflag_global) pvector[0] = occ;
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+double PairGaussOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,forcelj,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  int occ = 0;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      // define a Gaussian well to be occupied if
+      // the site it interacts with is within the force maximum
+
+      if (EFLAG)
+        if (eflag_global && rsq < 0.5/b[itype][jtype]) occ++;
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        forcelj = - 2.0*a[itype][jtype]*b[itype][jtype] * rsq *
+          exp(-b[itype][jtype]*rsq);
+        fpair = factor_lj*forcelj*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = -(a[itype][jtype]*exp(-b[itype][jtype]*rsq) -
+                    offset[itype][jtype]);
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+  return occ;
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairGaussOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairGauss::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_gauss_omp.h b/src/USER-OMP/pair_gauss_omp.h
new file mode 100644
index 0000000000..a65a98730a
--- /dev/null
+++ b/src/USER-OMP/pair_gauss_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gauss/omp,PairGaussOMP)
+
+#else
+
+#ifndef LMP_PAIR_GAUSS_OMP_H
+#define LMP_PAIR_GAUSS_OMP_H
+
+#include "pair_gauss.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairGaussOMP : public PairGauss, public ThrOMP {
+
+ public:
+  PairGaussOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  double eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_gayberne_omp.cpp b/src/USER-OMP/pair_gayberne_omp.cpp
new file mode 100644
index 0000000000..98fa5428a2
--- /dev/null
+++ b/src/USER-OMP/pair_gayberne_omp.cpp
@@ -0,0 +1,231 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_gayberne_omp.h"
+#include "math_extra.h"
+#include "atom.h"
+#include "comm.h"
+#include "atom_vec_ellipsoid.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairGayBerneOMP::PairGayBerneOMP(LAMMPS *lmp) :
+  PairGayBerne(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGayBerneOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairGayBerneOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj;
+  double fforce[3],ttor[3],rtor[3],r12[3];
+  double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3];
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  double *iquat,*jquat;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  double * const * const tor = thr->get_torque();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  const int * const ellipsoid = atom->ellipsoid;
+
+  AtomVecEllipsoid::Bonus *bonus = avec->bonus;
+
+  double fxtmp,fytmp,fztmp,t1tmp,t2tmp,t3tmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    itype = type[i];
+    fxtmp=fytmp=fztmp=t1tmp=t2tmp=t3tmp=0.0;
+
+    if (form[itype][itype] == ELLIPSE_ELLIPSE) {
+      iquat = bonus[ellipsoid[i]].quat;
+      MathExtra::quat_to_mat_trans(iquat,a1);
+      MathExtra::diag_times3(well[itype],a1,temp);
+      MathExtra::transpose_times3(a1,temp,b1);
+      MathExtra::diag_times3(shape2[itype],a1,temp);
+      MathExtra::transpose_times3(a1,temp,g1);
+    }
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      // r12 = center to center vector
+
+      r12[0] = x[j][0]-x[i][0];
+      r12[1] = x[j][1]-x[i][1];
+      r12[2] = x[j][2]-x[i][2];
+      rsq = MathExtra::dot3(r12,r12);
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+
+        switch (form[itype][jtype]) {
+        case SPHERE_SPHERE:
+          r2inv = 1.0/rsq;
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          forcelj *= -r2inv;
+          if (EFLAG)
+            one_eng = r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) -
+              offset[itype][jtype];
+          fforce[0] = r12[0]*forcelj;
+          fforce[1] = r12[1]*forcelj;
+          fforce[2] = r12[2]*forcelj;
+          ttor[0] = ttor[1] = ttor[2] = 0.0;
+          rtor[0] = rtor[1] = rtor[2] = 0.0;
+          break;
+
+        case SPHERE_ELLIPSE:
+          jquat = bonus[ellipsoid[j]].quat;
+          MathExtra::quat_to_mat_trans(jquat,a2);
+          MathExtra::diag_times3(well[jtype],a2,temp);
+          MathExtra::transpose_times3(a2,temp,b2);
+          MathExtra::diag_times3(shape2[jtype],a2,temp);
+          MathExtra::transpose_times3(a2,temp,g2);
+          one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor);
+          ttor[0] = ttor[1] = ttor[2] = 0.0;
+          break;
+
+        case ELLIPSE_SPHERE:
+          one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor);
+          rtor[0] = rtor[1] = rtor[2] = 0.0;
+          break;
+
+        default:
+          jquat = bonus[ellipsoid[j]].quat;
+          MathExtra::quat_to_mat_trans(jquat,a2);
+          MathExtra::diag_times3(well[jtype],a2,temp);
+          MathExtra::transpose_times3(a2,temp,b2);
+          MathExtra::diag_times3(shape2[jtype],a2,temp);
+          MathExtra::transpose_times3(a2,temp,g2);
+          one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq,
+                                      fforce,ttor,rtor);
+          break;
+        }
+
+        fforce[0] *= factor_lj;
+        fforce[1] *= factor_lj;
+        fforce[2] *= factor_lj;
+        ttor[0] *= factor_lj;
+        ttor[1] *= factor_lj;
+        ttor[2] *= factor_lj;
+
+        fxtmp += fforce[0];
+        fytmp += fforce[1];
+        fztmp += fforce[2];
+        t1tmp += ttor[0];
+        t2tmp += ttor[1];
+        t3tmp += ttor[2];
+
+        if (NEWTON_PAIR || j < nlocal) {
+          rtor[0] *= factor_lj;
+          rtor[1] *= factor_lj;
+          rtor[2] *= factor_lj;
+          f[j][0] -= fforce[0];
+          f[j][1] -= fforce[1];
+          f[j][2] -= fforce[2];
+          tor[j][0] += rtor[0];
+          tor[j][1] += rtor[1];
+          tor[j][2] += rtor[2];
+        }
+
+        if (EFLAG) evdwl = factor_lj*one_eng;
+
+        if (EVFLAG) ev_tally_xyz_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                     evdwl,0.0,fforce[0],fforce[1],fforce[2],
+                                     -r12[0],-r12[1],-r12[2],thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+    tor[i][0] += t1tmp;
+    tor[i][1] += t2tmp;
+    tor[i][2] += t3tmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairGayBerneOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairGayBerne::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_gayberne_omp.h b/src/USER-OMP/pair_gayberne_omp.h
new file mode 100644
index 0000000000..0e7b6708ff
--- /dev/null
+++ b/src/USER-OMP/pair_gayberne_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gayberne/omp,PairGayBerneOMP)
+
+#else
+
+#ifndef LMP_PAIR_GAYBERNE_OMP_H
+#define LMP_PAIR_GAYBERNE_OMP_H
+
+#include "pair_gayberne.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairGayBerneOMP : public PairGayBerne, public ThrOMP {
+
+ public:
+  PairGayBerneOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_gran_hertz_history_omp.cpp b/src/USER-OMP/pair_gran_hertz_history_omp.cpp
new file mode 100644
index 0000000000..1b49c2dfb9
--- /dev/null
+++ b/src/USER-OMP/pair_gran_hertz_history_omp.cpp
@@ -0,0 +1,313 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_gran_hertz_history_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "fix.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "update.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairGranHertzHistoryOMP::PairGranHertzHistoryOMP(LAMMPS *lmp) :
+  PairGranHertzHistory(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGranHertzHistoryOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  computeflag = 1;
+  const int shearupdate = (update->setupflag) ? 0 : 1;
+
+  // update body ptr and values for ghost atoms if using FixRigid masses
+
+  if (fix_rigid && neighbor->ago == 0) {
+    int tmp;
+    body = (int *) fix_rigid->extract("body",tmp);
+    mass_rigid = (double *) fix_rigid->extract("masstotal",tmp);
+    comm->forward_comm_pair(this);
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag)
+      if (shearupdate) eval<1,1>(ifrom, ito, thr);
+      else eval<1,0>(ifrom, ito, thr);
+    else
+      if (shearupdate) eval<0,1>(ifrom, ito, thr);
+      else eval<0,0>(ifrom, ito, thr);
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int SHEARUPDATE>
+void PairGranHertzHistoryOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fx,fy,fz;
+  double radi,radj,radsum,rsq,r,rinv,rsqinv;
+  double vr1,vr2,vr3,vnnr,vn1,vn2,vn3,vt1,vt2,vt3;
+  double wr1,wr2,wr3;
+  double vtr1,vtr2,vtr3,vrel;
+  double mi,mj,meff,damp,ccel,tor1,tor2,tor3;
+  double fn,fs,fs1,fs2,fs3;
+  double shrmag,rsht,polyhertz;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  int *touch,**firsttouch;
+  double *shear,*allshear,**firstshear;
+
+  const double * const * const x = atom->x;
+  const double * const * const v = atom->v;
+  const double * const * const omega = atom->omega;
+  const double * const radius = atom->radius;
+  const double * const rmass = atom->rmass;
+  const double * const mass = atom->mass;
+  double * const * const f = thr->get_f();
+  double * const * const torque = thr->get_torque();
+  const int * const type = atom->type;
+  const int * const mask = atom->mask;
+  const int nlocal = atom->nlocal;
+  double fxtmp,fytmp,fztmp;
+  double t1tmp,t2tmp,t3tmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+  firsttouch = list->listgranhistory->firstneigh;
+  firstshear = list->listgranhistory->firstdouble;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    radi = radius[i];
+    touch = firsttouch[i];
+    allshear = firstshear[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=t1tmp=t2tmp=t3tmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      radj = radius[j];
+      radsum = radi + radj;
+
+      if (rsq >= radsum*radsum) {
+
+        // unset non-touching neighbors
+
+        touch[jj] = 0;
+        shear = &allshear[3*jj];
+        shear[0] = 0.0;
+        shear[1] = 0.0;
+        shear[2] = 0.0;
+
+      } else {
+        r = sqrt(rsq);
+        rinv = 1.0/r;
+        rsqinv = 1.0/rsq;
+
+        // relative translational velocity
+
+        vr1 = v[i][0] - v[j][0];
+        vr2 = v[i][1] - v[j][1];
+        vr3 = v[i][2] - v[j][2];
+
+        // normal component
+
+        vnnr = vr1*delx + vr2*dely + vr3*delz;
+        vn1 = delx*vnnr * rsqinv;
+        vn2 = dely*vnnr * rsqinv;
+        vn3 = delz*vnnr * rsqinv;
+
+        // tangential component
+
+        vt1 = vr1 - vn1;
+        vt2 = vr2 - vn2;
+        vt3 = vr3 - vn3;
+
+        // relative rotational velocity
+
+        wr1 = (radi*omega[i][0] + radj*omega[j][0]) * rinv;
+        wr2 = (radi*omega[i][1] + radj*omega[j][1]) * rinv;
+        wr3 = (radi*omega[i][2] + radj*omega[j][2]) * rinv;
+
+        // meff = effective mass of pair of particles
+        // if I or J part of rigid body, use body mass
+        // if I or J is frozen, meff is other particle
+
+        if (rmass) {
+          mi = rmass[i];
+          mj = rmass[j];
+        } else {
+          mi = mass[type[i]];
+          mj = mass[type[j]];
+        }
+        if (fix_rigid) {
+          if (body[i] >= 0) mi = mass_rigid[body[i]];
+          if (body[j] >= 0) mj = mass_rigid[body[j]];
+        }
+
+        meff = mi*mj / (mi+mj);
+        if (mask[i] & freeze_group_bit) meff = mj;
+        if (mask[j] & freeze_group_bit) meff = mi;
+
+        // normal force = Hertzian contact + normal velocity damping
+
+        damp = meff*gamman*vnnr*rsqinv;
+        ccel = kn*(radsum-r)*rinv - damp;
+        polyhertz = sqrt((radsum-r)*radi*radj / radsum);
+        ccel *= polyhertz;
+
+        // relative velocities
+
+        vtr1 = vt1 - (delz*wr2-dely*wr3);
+        vtr2 = vt2 - (delx*wr3-delz*wr1);
+        vtr3 = vt3 - (dely*wr1-delx*wr2);
+        vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3;
+        vrel = sqrt(vrel);
+
+        // shear history effects
+
+        touch[jj] = 1;
+        shear = &allshear[3*jj];
+
+        if (SHEARUPDATE) {
+          shear[0] += vtr1*dt;
+          shear[1] += vtr2*dt;
+          shear[2] += vtr3*dt;
+        }
+        shrmag = sqrt(shear[0]*shear[0] + shear[1]*shear[1] +
+                      shear[2]*shear[2]);
+
+        // rotate shear displacements
+
+        rsht = shear[0]*delx + shear[1]*dely + shear[2]*delz;
+        rsht *= rsqinv;
+        if (SHEARUPDATE) {
+          shear[0] -= rsht*delx;
+          shear[1] -= rsht*dely;
+          shear[2] -= rsht*delz;
+        }
+
+        // tangential forces = shear + tangential velocity damping
+
+        fs1 = -polyhertz * (kt*shear[0] + meff*gammat*vtr1);
+        fs2 = -polyhertz * (kt*shear[1] + meff*gammat*vtr2);
+        fs3 = -polyhertz * (kt*shear[2] + meff*gammat*vtr3);
+
+        // rescale frictional displacements and forces if needed
+
+        fs = sqrt(fs1*fs1 + fs2*fs2 + fs3*fs3);
+        fn = xmu * fabs(ccel*r);
+
+        if (fs > fn) {
+          if (shrmag != 0.0) {
+            const double fnfs = fn/fs;
+            const double mgkt = meff*gammat/kt;
+            shear[0] = fnfs * (shear[0] + mgkt*vtr1) - mgkt*vtr1;
+            shear[1] = fnfs * (shear[1] + mgkt*vtr2) - mgkt*vtr2;
+            shear[2] = fnfs * (shear[2] + mgkt*vtr3) - mgkt*vtr3;
+            fs1 *= fnfs;
+            fs2 *= fnfs;
+            fs3 *= fnfs;
+          } else fs1 = fs2 = fs3 = 0.0;
+        }
+
+        // forces & torques
+
+        fx = delx*ccel + fs1;
+        fy = dely*ccel + fs2;
+        fz = delz*ccel + fs3;
+        fxtmp  += fx;
+        fytmp  += fy;
+        fztmp  += fz;
+
+        tor1 = rinv * (dely*fs3 - delz*fs2);
+        tor2 = rinv * (delz*fs1 - delx*fs3);
+        tor3 = rinv * (delx*fs2 - dely*fs1);
+        t1tmp -= radi*tor1;
+        t2tmp -= radi*tor2;
+        t3tmp -= radi*tor3;
+
+        if (j < nlocal) {
+          f[j][0] -= fx;
+          f[j][1] -= fy;
+          f[j][2] -= fz;
+          torque[j][0] -= radj*tor1;
+          torque[j][1] -= radj*tor2;
+          torque[j][2] -= radj*tor3;
+        }
+
+        if (EVFLAG) ev_tally_xyz_thr(this,i,j,nlocal,/* newton_pair */ 0,
+                                     0.0,0.0,fx,fy,fz,delx,dely,delz,thr);
+
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+    torque[i][0] += t1tmp;
+    torque[i][1] += t2tmp;
+    torque[i][2] += t3tmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairGranHertzHistoryOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairGranHertzHistory::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_gran_hertz_history_omp.h b/src/USER-OMP/pair_gran_hertz_history_omp.h
new file mode 100644
index 0000000000..1a8d819920
--- /dev/null
+++ b/src/USER-OMP/pair_gran_hertz_history_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gran/hertz/history/omp,PairGranHertzHistoryOMP)
+
+#else
+
+#ifndef LMP_PAIR_GRAN_HERTZ_HISTORY_OMP_H
+#define LMP_PAIR_GRAN_HERTZ_HISTORY_OMP_H
+
+#include "pair_gran_hertz_history.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairGranHertzHistoryOMP : public PairGranHertzHistory, public ThrOMP {
+
+ public:
+  PairGranHertzHistoryOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int SHEARUPDATE>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_gran_hooke_history_omp.cpp b/src/USER-OMP/pair_gran_hooke_history_omp.cpp
new file mode 100644
index 0000000000..6362228b41
--- /dev/null
+++ b/src/USER-OMP/pair_gran_hooke_history_omp.cpp
@@ -0,0 +1,317 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_gran_hooke_history_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "fix.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "update.h"
+
+#include "string.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairGranHookeHistoryOMP::PairGranHookeHistoryOMP(LAMMPS *lmp) :
+  PairGranHookeHistory(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  // trigger use of OpenMP version of FixShearHistory
+  suffix = new char[4];
+  memcpy(suffix,"omp",4);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGranHookeHistoryOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  computeflag = 1;
+  const int shearupdate = (update->setupflag) ? 0 : 1;
+
+  // update body ptr and values for ghost atoms if using FixRigid masses
+
+  if (fix_rigid && neighbor->ago == 0) {
+    int tmp;
+    body = (int *) fix_rigid->extract("body",tmp);
+    mass_rigid = (double *) fix_rigid->extract("masstotal",tmp);
+    comm->forward_comm_pair(this);
+  }
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag)
+      if (shearupdate) eval<1,1>(ifrom, ito, thr);
+      else eval<1,0>(ifrom, ito, thr);
+    else
+      if (shearupdate) eval<0,1>(ifrom, ito, thr);
+      else eval<0,0>(ifrom, ito, thr);
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int SHEARUPDATE>
+void PairGranHookeHistoryOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fx,fy,fz;
+  double myshear[3];
+  double radi,radj,radsum,rsq,r,rinv,rsqinv;
+  double vr1,vr2,vr3,vnnr,vn1,vn2,vn3,vt1,vt2,vt3;
+  double wr1,wr2,wr3;
+  double vtr1,vtr2,vtr3,vrel;
+  double mi,mj,meff,damp,ccel,tor1,tor2,tor3;
+  double fn,fs,fs1,fs2,fs3;
+  double shrmag,rsht;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  int *touch,**firsttouch;
+  double *allshear,**firstshear;
+
+  const double * const * const x = atom->x;
+  const double * const * const v = atom->v;
+  const double * const * const omega = atom->omega;
+  const double * const radius = atom->radius;
+  const double * const rmass = atom->rmass;
+  const double * const mass = atom->mass;
+  double * const * const f = thr->get_f();
+  double * const * const torque = thr->get_torque();
+  const int * const type = atom->type;
+  const int * const mask = atom->mask;
+  const int nlocal = atom->nlocal;
+  double fxtmp,fytmp,fztmp;
+  double t1tmp,t2tmp,t3tmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+  firsttouch = listgranhistory->firstneigh;
+  firstshear = listgranhistory->firstdouble;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    radi = radius[i];
+    touch = firsttouch[i];
+    allshear = firstshear[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=t1tmp=t2tmp=t3tmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      radj = radius[j];
+      radsum = radi + radj;
+
+      if (rsq >= radsum*radsum) {
+
+        // unset non-touching neighbors
+
+        touch[jj] = 0;
+        myshear[0] = 0.0;
+        myshear[1] = 0.0;
+        myshear[2] = 0.0;
+
+      } else {
+        r = sqrt(rsq);
+        rinv = 1.0/r;
+        rsqinv = 1.0/rsq;
+
+        // relative translational velocity
+
+        vr1 = v[i][0] - v[j][0];
+        vr2 = v[i][1] - v[j][1];
+        vr3 = v[i][2] - v[j][2];
+
+        // normal component
+
+        vnnr = vr1*delx + vr2*dely + vr3*delz;
+        vn1 = delx*vnnr * rsqinv;
+        vn2 = dely*vnnr * rsqinv;
+        vn3 = delz*vnnr * rsqinv;
+
+        // tangential component
+
+        vt1 = vr1 - vn1;
+        vt2 = vr2 - vn2;
+        vt3 = vr3 - vn3;
+
+        // relative rotational velocity
+
+        wr1 = (radi*omega[i][0] + radj*omega[j][0]) * rinv;
+        wr2 = (radi*omega[i][1] + radj*omega[j][1]) * rinv;
+        wr3 = (radi*omega[i][2] + radj*omega[j][2]) * rinv;
+
+        // meff = effective mass of pair of particles
+        // if I or J part of rigid body, use body mass
+        // if I or J is frozen, meff is other particle
+
+        if (rmass) {
+          mi = rmass[i];
+          mj = rmass[j];
+        } else {
+          mi = mass[type[i]];
+          mj = mass[type[j]];
+        }
+        if (fix_rigid) {
+          if (body[i] >= 0) mi = mass_rigid[body[i]];
+          if (body[j] >= 0) mj = mass_rigid[body[j]];
+        }
+
+        meff = mi*mj / (mi+mj);
+        if (mask[i] & freeze_group_bit) meff = mj;
+        if (mask[j] & freeze_group_bit) meff = mi;
+
+        // normal forces = Hookian contact + normal velocity damping
+
+        damp = meff*gamman*vnnr*rsqinv;
+        ccel = kn*(radsum-r)*rinv - damp;
+
+        // relative velocities
+
+        vtr1 = vt1 - (delz*wr2-dely*wr3);
+        vtr2 = vt2 - (delx*wr3-delz*wr1);
+        vtr3 = vt3 - (dely*wr1-delx*wr2);
+        vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3;
+        vrel = sqrt(vrel);
+
+        // shear history effects
+
+        touch[jj] = 1;
+        memcpy(myshear,allshear + 3*jj, 3*sizeof(double));
+
+        if (SHEARUPDATE) {
+          myshear[0] += vtr1*dt;
+          myshear[1] += vtr2*dt;
+          myshear[2] += vtr3*dt;
+        }
+        shrmag = sqrt(myshear[0]*myshear[0] + myshear[1]*myshear[1] +
+                      myshear[2]*myshear[2]);
+
+        // rotate shear displacements
+
+        rsht = myshear[0]*delx + myshear[1]*dely + myshear[2]*delz;
+        rsht *= rsqinv;
+        if (SHEARUPDATE) {
+          myshear[0] -= rsht*delx;
+          myshear[1] -= rsht*dely;
+          myshear[2] -= rsht*delz;
+        }
+
+        // tangential forces = shear + tangential velocity damping
+
+        fs1 = - (kt*myshear[0] + meff*gammat*vtr1);
+        fs2 = - (kt*myshear[1] + meff*gammat*vtr2);
+        fs3 = - (kt*myshear[2] + meff*gammat*vtr3);
+
+        // rescale frictional displacements and forces if needed
+
+        fs = sqrt(fs1*fs1 + fs2*fs2 + fs3*fs3);
+        fn = xmu * fabs(ccel*r);
+
+        if (fs > fn) {
+          if (shrmag != 0.0) {
+            const double fnfs = fn/fs;
+            const double mgkt = meff*gammat/kt;
+            myshear[0] = fnfs * (myshear[0] + mgkt*vtr1) - mgkt*vtr1;
+            myshear[1] = fnfs * (myshear[1] + mgkt*vtr2) - mgkt*vtr2;
+            myshear[2] = fnfs * (myshear[2] + mgkt*vtr3) - mgkt*vtr3;
+            fs1 *= fnfs;
+            fs2 *= fnfs;
+            fs3 *= fnfs;
+          } else fs1 = fs2 = fs3 = 0.0;
+        }
+
+        // forces & torques
+
+        fx = delx*ccel + fs1;
+        fy = dely*ccel + fs2;
+        fz = delz*ccel + fs3;
+        fxtmp  += fx;
+        fytmp  += fy;
+        fztmp  += fz;
+
+        tor1 = rinv * (dely*fs3 - delz*fs2);
+        tor2 = rinv * (delz*fs1 - delx*fs3);
+        tor3 = rinv * (delx*fs2 - dely*fs1);
+        t1tmp -= radi*tor1;
+        t2tmp -= radi*tor2;
+        t3tmp -= radi*tor3;
+
+        if (j < nlocal) {
+          f[j][0] -= fx;
+          f[j][1] -= fy;
+          f[j][2] -= fz;
+          torque[j][0] -= radj*tor1;
+          torque[j][1] -= radj*tor2;
+          torque[j][2] -= radj*tor3;
+        }
+
+        if (EVFLAG) ev_tally_xyz_thr(this,i,j,nlocal,/* newton_pair */ 0,
+                                     0.0,0.0,fx,fy,fz,delx,dely,delz,thr);
+
+      }
+      memcpy(allshear + 3*jj, myshear, 3*sizeof(double));
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+    torque[i][0] += t1tmp;
+    torque[i][1] += t2tmp;
+    torque[i][2] += t3tmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairGranHookeHistoryOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairGranHookeHistory::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_gran_hooke_history_omp.h b/src/USER-OMP/pair_gran_hooke_history_omp.h
new file mode 100644
index 0000000000..9cdebf7104
--- /dev/null
+++ b/src/USER-OMP/pair_gran_hooke_history_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gran/hooke/history/omp,PairGranHookeHistoryOMP)
+
+#else
+
+#ifndef LMP_PAIR_GRAN_HOOKE_HISTORY_OMP_H
+#define LMP_PAIR_GRAN_HOOKE_HISTORY_OMP_H
+
+#include "pair_gran_hooke_history.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairGranHookeHistoryOMP : public PairGranHookeHistory, public ThrOMP {
+
+ public:
+  PairGranHookeHistoryOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int SHEARUPDATE>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_gran_hooke_omp.cpp b/src/USER-OMP/pair_gran_hooke_omp.cpp
new file mode 100644
index 0000000000..a1a4b1fe03
--- /dev/null
+++ b/src/USER-OMP/pair_gran_hooke_omp.cpp
@@ -0,0 +1,256 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_gran_hooke_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "fix.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairGranHookeOMP::PairGranHookeOMP(LAMMPS *lmp) :
+  PairGranHooke(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGranHookeOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  // update body ptr and values for ghost atoms if using FixRigid masses
+
+  if (fix_rigid && neighbor->ago == 0) {
+    int tmp;
+    body = (int *) fix_rigid->extract("body",tmp);
+    mass_rigid = (double *) fix_rigid->extract("masstotal",tmp);
+    comm->forward_comm_pair(this);
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag)
+      if (force->newton_pair) eval<1,1>(ifrom, ito, thr);
+      else eval<1,0>(ifrom, ito, thr);
+    else
+      if (force->newton_pair) eval<0,1>(ifrom, ito, thr);
+      else eval<0,0>(ifrom, ito, thr);
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int NEWTON_PAIR>
+void PairGranHookeOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fx,fy,fz;
+  double radi,radj,radsum,rsq,r,rinv,rsqinv;
+  double vr1,vr2,vr3,vnnr,vn1,vn2,vn3,vt1,vt2,vt3;
+  double wr1,wr2,wr3;
+  double vtr1,vtr2,vtr3,vrel;
+  double mi,mj,meff,damp,ccel,tor1,tor2,tor3;
+  double fn,fs,ft,fs1,fs2,fs3;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  const double * const * const x = atom->x;
+  const double * const * const v = atom->v;
+  const double * const * const omega = atom->omega;
+  const double * const radius = atom->radius;
+  const double * const rmass = atom->rmass;
+  const double * const mass = atom->mass;
+  double * const * const f = thr->get_f();
+  double * const * const torque = thr->get_torque();
+  const int * const type = atom->type;
+  const int * const mask = atom->mask;
+  const int nlocal = atom->nlocal;
+  double fxtmp,fytmp,fztmp;
+  double t1tmp,t2tmp,t3tmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    radi = radius[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=t1tmp=t2tmp=t3tmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      radj = radius[j];
+      radsum = radi + radj;
+
+      if (rsq < radsum*radsum) {
+        r = sqrt(rsq);
+        rinv = 1.0/r;
+        rsqinv = 1.0/rsq;
+
+        // relative translational velocity
+
+        vr1 = v[i][0] - v[j][0];
+        vr2 = v[i][1] - v[j][1];
+        vr3 = v[i][2] - v[j][2];
+
+        // normal component
+
+        vnnr = vr1*delx + vr2*dely + vr3*delz;
+        vn1 = delx*vnnr * rsqinv;
+        vn2 = dely*vnnr * rsqinv;
+        vn3 = delz*vnnr * rsqinv;
+
+        // tangential component
+
+        vt1 = vr1 - vn1;
+        vt2 = vr2 - vn2;
+        vt3 = vr3 - vn3;
+
+        // relative rotational velocity
+
+        wr1 = (radi*omega[i][0] + radj*omega[j][0]) * rinv;
+        wr2 = (radi*omega[i][1] + radj*omega[j][1]) * rinv;
+        wr3 = (radi*omega[i][2] + radj*omega[j][2]) * rinv;
+
+        // meff = effective mass of pair of particles
+        // if I or J part of rigid body, use body mass
+        // if I or J is frozen, meff is other particle
+
+        if (rmass) {
+          mi = rmass[i];
+          mj = rmass[j];
+        } else {
+          mi = mass[type[i]];
+          mj = mass[type[j]];
+        }
+        if (fix_rigid) {
+          if (body[i] >= 0) mi = mass_rigid[body[i]];
+          if (body[j] >= 0) mj = mass_rigid[body[j]];
+        }
+
+        meff = mi*mj / (mi+mj);
+        if (mask[i] & freeze_group_bit) meff = mj;
+        if (mask[j] & freeze_group_bit) meff = mi;
+
+        // normal forces = Hookian contact + normal velocity damping
+
+        damp = meff*gamman*vnnr*rsqinv;
+        ccel = kn*(radsum-r)*rinv - damp;
+
+        // relative velocities
+
+        vtr1 = vt1 - (delz*wr2-dely*wr3);
+        vtr2 = vt2 - (delx*wr3-delz*wr1);
+        vtr3 = vt3 - (dely*wr1-delx*wr2);
+        vrel = vtr1*vtr1 + vtr2*vtr2 + vtr3*vtr3;
+        vrel = sqrt(vrel);
+
+        // force normalization
+
+        fn = xmu * fabs(ccel*r);
+        fs = meff*gammat*vrel;
+        if (vrel != 0.0) ft = MIN(fn,fs) / vrel;
+        else ft = 0.0;
+
+        // tangential force due to tangential velocity damping
+
+        fs1 = -ft*vtr1;
+        fs2 = -ft*vtr2;
+        fs3 = -ft*vtr3;
+
+        // forces & torques
+
+        fx = delx*ccel + fs1;
+        fy = dely*ccel + fs2;
+        fz = delz*ccel + fs3;
+        fxtmp  += fx;
+        fytmp  += fy;
+        fztmp  += fz;
+
+        tor1 = rinv * (dely*fs3 - delz*fs2);
+        tor2 = rinv * (delz*fs1 - delx*fs3);
+        tor3 = rinv * (delx*fs2 - dely*fs1);
+        t1tmp -= radi*tor1;
+        t2tmp -= radi*tor2;
+        t3tmp -= radi*tor3;
+
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= fx;
+          f[j][1] -= fy;
+          f[j][2] -= fz;
+          torque[j][0] -= radj*tor1;
+          torque[j][1] -= radj*tor2;
+          torque[j][2] -= radj*tor3;
+        }
+
+        if (EVFLAG) ev_tally_xyz_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                     0.0,0.0,fx,fy,fz,delx,dely,delz,thr);
+
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+    torque[i][0] += t1tmp;
+    torque[i][1] += t2tmp;
+    torque[i][2] += t3tmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairGranHookeOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairGranHooke::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_gran_hooke_omp.h b/src/USER-OMP/pair_gran_hooke_omp.h
new file mode 100644
index 0000000000..81f831fcea
--- /dev/null
+++ b/src/USER-OMP/pair_gran_hooke_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gran/hooke/omp,PairGranHookeOMP)
+
+#else
+
+#ifndef LMP_PAIR_GRAN_HOOKE_OMP_H
+#define LMP_PAIR_GRAN_HOOKE_OMP_H
+
+#include "pair_gran_hooke.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairGranHookeOMP : public PairGranHooke, public ThrOMP {
+
+ public:
+  PairGranHookeOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_hbond_dreiding_lj_omp.cpp b/src/USER-OMP/pair_hbond_dreiding_lj_omp.cpp
new file mode 100644
index 0000000000..8e4b8a7a90
--- /dev/null
+++ b/src/USER-OMP/pair_hbond_dreiding_lj_omp.cpp
@@ -0,0 +1,297 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_hbond_dreiding_lj_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "math_const.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+PairHbondDreidingLJOMP::PairHbondDreidingLJOMP(LAMMPS *lmp) :
+  PairHbondDreidingLJ(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  hbcount_thr = hbeng_thr = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairHbondDreidingLJOMP::~PairHbondDreidingLJOMP()
+{
+  if (hbcount_thr) {
+    delete[] hbcount_thr;
+    delete[] hbeng_thr;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairHbondDreidingLJOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  if (!hbcount_thr) {
+    hbcount_thr = new double[nthreads];
+    hbeng_thr = new double[nthreads];
+  }
+
+  for (int i=0; i < nthreads; ++i) {
+    hbcount_thr[i] = 0.0;
+    hbeng_thr[i] = 0.0;
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+
+  // reduce per thread hbond data
+  if (eflag_global) {
+    pvector[0] = 0.0;
+    pvector[1] = 0.0;
+    for (int i=0; i < nthreads; ++i) {
+      pvector[0] += hbcount_thr[i];
+      pvector[1] += hbeng_thr[i];
+    }
+  }
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairHbondDreidingLJOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,k,m,ii,jj,kk,jnum,itype,jtype,ktype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq,rsq1,rsq2,r1,r2;
+  double factor_hb,force_angle,force_kernel,evdwl,eng_lj;
+  double c,s,a,b,ac,a11,a12,a22,vx1,vx2,vy1,vy2,vz1,vz2;
+  double fi[3],fj[3],delr1[3],delr2[3];
+  double r2inv,r10inv;
+  double switch1,switch2;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  Param *pm;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const double * const special_lj = force->special_lj;
+  const int * const * const nspecial = atom->nspecial;
+  const int * const * const special = atom->special;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+
+  // ii = loop over donors
+  // jj = loop over acceptors
+  // kk = loop over hydrogens bonded to donor
+
+  int hbcount = 0;
+  double hbeng = 0.0;
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    itype = type[i];
+    if (!donor[itype]) continue;
+
+    const int * const klist = special[i];
+    const int knum = nspecial[i][0];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_hb = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      jtype = type[j];
+      if (!acceptor[jtype]) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      for (kk = 0; kk < knum; kk++) {
+        k = atom->map(klist[kk]);
+        if (k < 0) continue;
+        ktype = type[k];
+        m = type2param[itype][jtype][ktype];
+        if (m < 0) continue;
+        pm = &params[m];
+
+        if (rsq < pm->cut_outersq) {
+          delr1[0] = xtmp - x[k][0];
+          delr1[1] = ytmp - x[k][1];
+          delr1[2] = ztmp - x[k][2];
+          domain->minimum_image(delr1);
+          rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
+          r1 = sqrt(rsq1);
+
+          delr2[0] = x[j][0] - x[k][0];
+          delr2[1] = x[j][1] - x[k][1];
+          delr2[2] = x[j][2] - x[k][2];
+          domain->minimum_image(delr2);
+          rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
+          r2 = sqrt(rsq2);
+
+          // angle (cos and sin)
+
+          c = delr1[0]*delr2[0] + delr1[1]*delr2[1] + delr1[2]*delr2[2];
+          c /= r1*r2;
+          if (c > 1.0) c = 1.0;
+          if (c < -1.0) c = -1.0;
+          ac = acos(c);
+
+          if (ac > pm->cut_angle && ac < (2.0*MY_PI - pm->cut_angle)) {
+            s = sqrt(1.0 - c*c);
+            if (s < SMALL) s = SMALL;
+
+            // LJ-specific kernel
+
+            r2inv = 1.0/rsq;
+            r10inv = r2inv*r2inv*r2inv*r2inv*r2inv;
+            force_kernel = r10inv*(pm->lj1*r2inv - pm->lj2)*r2inv *
+              pow(c,(double)pm->ap);
+            force_angle = pm->ap * r10inv*(pm->lj3*r2inv - pm->lj4) *
+              pow(c,pm->ap-1.0)*s;
+
+            eng_lj = r10inv*(pm->lj3*r2inv - pm->lj4);
+            if (rsq > pm->cut_innersq) {
+              switch1 = (pm->cut_outersq-rsq) * (pm->cut_outersq-rsq) *
+                        (pm->cut_outersq + 2.0*rsq - 3.0*pm->cut_innersq) /
+                        pm->denom_vdw;
+              switch2 = 12.0*rsq * (pm->cut_outersq-rsq) *
+                        (rsq-pm->cut_innersq) / pm->denom_vdw;
+              force_kernel = force_kernel*switch1 + eng_lj*switch2;
+              eng_lj *= switch1;
+            }
+
+            if (EFLAG) {
+              evdwl = eng_lj * pow(c,(double)pm->ap);
+              evdwl *= factor_hb;
+            }
+
+            a = factor_hb*force_angle/s;
+            b = factor_hb*force_kernel;
+
+            a11 = a*c / rsq1;
+            a12 = -a / (r1*r2);
+            a22 = a*c / rsq2;
+
+            vx1 = a11*delr1[0] + a12*delr2[0];
+            vx2 = a22*delr2[0] + a12*delr1[0];
+            vy1 = a11*delr1[1] + a12*delr2[1];
+            vy2 = a22*delr2[1] + a12*delr1[1];
+            vz1 = a11*delr1[2] + a12*delr2[2];
+            vz2 = a22*delr2[2] + a12*delr1[2];
+
+            fi[0] = vx1 + b*delx;
+            fi[1] = vy1 + b*dely;
+            fi[2] = vz1 + b*delz;
+            fj[0] = vx2 - b*delx;
+            fj[1] = vy2 - b*dely;
+            fj[2] = vz2 - b*delz;
+
+            fxtmp += fi[0];
+            fytmp += fi[1];
+            fztmp += fi[2];
+
+            f[j][0] += fj[0];
+            f[j][1] += fj[1];
+            f[j][2] += fj[2];
+
+            f[k][0] -= vx1 + vx2;
+            f[k][1] -= vy1 + vy2;
+            f[k][2] -= vz1 + vz2;
+
+            // KIJ instead of IJK b/c delr1/delr2 are both with respect to k
+
+            if (EVFLAG) ev_tally3_thr(this,k,i,j,evdwl,0.0,fi,fj,delr1,delr2,thr);
+            if (EFLAG) {
+              hbcount++;
+              hbeng += evdwl;
+            }
+          }
+        }
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+  const int tid = thr->get_tid();
+  hbcount_thr[tid] = static_cast<double>(hbcount);
+  hbeng_thr[tid] = hbeng;
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairHbondDreidingLJOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += comm->nthreads * 2 * sizeof(double);
+  bytes += PairHbondDreidingLJ::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_hbond_dreiding_lj_omp.h b/src/USER-OMP/pair_hbond_dreiding_lj_omp.h
new file mode 100644
index 0000000000..dc5108b3d7
--- /dev/null
+++ b/src/USER-OMP/pair_hbond_dreiding_lj_omp.h
@@ -0,0 +1,52 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(hbond/dreiding/lj/omp,PairHbondDreidingLJOMP)
+
+#else
+
+#ifndef LMP_PAIR_HBOND_DREIDING_LJ_OMP_H
+#define LMP_PAIR_HBOND_DREIDING_LJ_OMP_H
+
+#include "pair_hbond_dreiding_lj.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairHbondDreidingLJOMP : public PairHbondDreidingLJ, public ThrOMP {
+
+ public:
+  PairHbondDreidingLJOMP(class LAMMPS *);
+  virtual ~PairHbondDreidingLJOMP();
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ protected:
+  double *hbcount_thr, *hbeng_thr;
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_hbond_dreiding_morse_omp.cpp b/src/USER-OMP/pair_hbond_dreiding_morse_omp.cpp
new file mode 100644
index 0000000000..3dccd98dbf
--- /dev/null
+++ b/src/USER-OMP/pair_hbond_dreiding_morse_omp.cpp
@@ -0,0 +1,295 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_hbond_dreiding_morse_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "math_const.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 0.001
+
+/* ---------------------------------------------------------------------- */
+
+PairHbondDreidingMorseOMP::PairHbondDreidingMorseOMP(LAMMPS *lmp) :
+  PairHbondDreidingMorse(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  hbcount_thr = hbeng_thr = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairHbondDreidingMorseOMP::~PairHbondDreidingMorseOMP()
+{
+  if (hbcount_thr) {
+    delete[] hbcount_thr;
+    delete[] hbeng_thr;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairHbondDreidingMorseOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  if (!hbcount_thr) {
+    hbcount_thr = new double[nthreads];
+    hbeng_thr = new double[nthreads];
+  }
+
+  for (int i=0; i < nthreads; ++i) {
+    hbcount_thr[i] = 0.0;
+    hbeng_thr[i] = 0.0;
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+
+  // reduce per thread hbond data
+  if (eflag_global) {
+    pvector[0] = 0.0;
+    pvector[1] = 0.0;
+    for (int i=0; i < nthreads; ++i) {
+      pvector[0] += hbcount_thr[i];
+      pvector[1] += hbeng_thr[i];
+    }
+  }
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairHbondDreidingMorseOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,k,m,ii,jj,kk,jnum,itype,jtype,ktype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,rsq,rsq1,rsq2,r1,r2;
+  double factor_hb,force_angle,force_kernel,evdwl;
+  double c,s,a,b,ac,a11,a12,a22,vx1,vx2,vy1,vy2,vz1,vz2;
+  double fi[3],fj[3],delr1[3],delr2[3];
+  double r,dr,dexp,eng_morse,switch1,switch2;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  Param *pm;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const double * const special_lj = force->special_lj;
+  const int * const * const nspecial = atom->nspecial;
+  const int * const * const special = atom->special;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+
+  // ii = loop over donors
+  // jj = loop over acceptors
+  // kk = loop over hydrogens bonded to donor
+
+  int hbcount = 0;
+  double hbeng = 0.0;
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    itype = type[i];
+    if (!donor[itype]) continue;
+
+    const int * const klist = special[i];
+    const int knum = nspecial[i][0];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_hb = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      jtype = type[j];
+      if (!acceptor[jtype]) continue;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      for (kk = 0; kk < knum; kk++) {
+        k = atom->map(klist[kk]);
+        if (k < 0) continue;
+        ktype = type[k];
+        m = type2param[itype][jtype][ktype];
+        if (m < 0) continue;
+        pm = &params[m];
+
+        if (rsq < pm->cut_outersq) {
+          delr1[0] = xtmp - x[k][0];
+          delr1[1] = ytmp - x[k][1];
+          delr1[2] = ztmp - x[k][2];
+          domain->minimum_image(delr1);
+          rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
+          r1 = sqrt(rsq1);
+
+          delr2[0] = x[j][0] - x[k][0];
+          delr2[1] = x[j][1] - x[k][1];
+          delr2[2] = x[j][2] - x[k][2];
+          domain->minimum_image(delr2);
+          rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
+          r2 = sqrt(rsq2);
+
+          // angle (cos and sin)
+
+          c = delr1[0]*delr2[0] + delr1[1]*delr2[1] + delr1[2]*delr2[2];
+          c /= r1*r2;
+          if (c > 1.0) c = 1.0;
+          if (c < -1.0) c = -1.0;
+          ac = acos(c);
+
+          if (ac > pm->cut_angle && ac < (2.0*MY_PI - pm->cut_angle)) {
+            s = sqrt(1.0 - c*c);
+            if (s < SMALL) s = SMALL;
+
+            // Morse-specific kernel
+
+            r = sqrt(rsq);
+            dr = r - pm->r0;
+            dexp = exp(-pm->alpha * dr);
+            force_kernel = pm->morse1*(dexp*dexp - dexp)/r * pow(c,(double)pm->ap);
+            force_angle = pm->ap * eng_morse * pow(c,(double)pm->ap-1.0)*s;
+
+            eng_morse = pm->d0 * (dexp*dexp - 2.0*dexp);
+            if (rsq > pm->cut_innersq) {
+              switch1 = (pm->cut_outersq-rsq) * (pm->cut_outersq-rsq) *
+                        (pm->cut_outersq + 2.0*rsq - 3.0*pm->cut_innersq) /
+                        pm->denom_vdw;
+              switch2 = 12.0*rsq * (pm->cut_outersq-rsq) *
+                        (rsq-pm->cut_innersq) / pm->denom_vdw;
+              force_kernel = force_kernel*switch1 + eng_morse*switch2;
+              eng_morse *= switch1;
+            }
+
+            if (EFLAG) {
+              evdwl = eng_morse * pow(c,(double)params[m].ap);
+              evdwl *= factor_hb;
+            }
+
+            a = factor_hb*force_angle/s;
+            b = factor_hb*force_kernel;
+
+            a11 = a*c / rsq1;
+            a12 = -a / (r1*r2);
+            a22 = a*c / rsq2;
+
+            vx1 = a11*delr1[0] + a12*delr2[0];
+            vx2 = a22*delr2[0] + a12*delr1[0];
+            vy1 = a11*delr1[1] + a12*delr2[1];
+            vy2 = a22*delr2[1] + a12*delr1[1];
+            vz1 = a11*delr1[2] + a12*delr2[2];
+            vz2 = a22*delr2[2] + a12*delr1[2];
+
+            fi[0] = vx1 + b*delx;
+            fi[1] = vy1 + b*dely;
+            fi[2] = vz1 + b*delz;
+            fj[0] = vx2 - b*delx;
+            fj[1] = vy2 - b*dely;
+            fj[2] = vz2 - b*delz;
+
+            fxtmp += fi[0];
+            fytmp += fi[1];
+            fztmp += fi[2];
+
+            f[j][0] += fj[0];
+            f[j][1] += fj[1];
+            f[j][2] += fj[2];
+
+            f[k][0] -= vx1 + vx2;
+            f[k][1] -= vy1 + vy2;
+            f[k][2] -= vz1 + vz2;
+
+            // KIJ instead of IJK b/c delr1/delr2 are both with respect to k
+
+            if (EVFLAG) ev_tally3_thr(this,k,i,j,evdwl,0.0,fi,fj,delr1,delr2,thr);
+            if (EFLAG) {
+              hbcount++;
+              hbeng += evdwl;
+            }
+          }
+        }
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+  const int tid = thr->get_tid();
+  hbcount_thr[tid] = static_cast<double>(hbcount);
+  hbeng_thr[tid] = hbeng;
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairHbondDreidingMorseOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += comm->nthreads * 2 * sizeof(double);
+  bytes += PairHbondDreidingMorse::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_hbond_dreiding_morse_omp.h b/src/USER-OMP/pair_hbond_dreiding_morse_omp.h
new file mode 100644
index 0000000000..5a8aaf2c5a
--- /dev/null
+++ b/src/USER-OMP/pair_hbond_dreiding_morse_omp.h
@@ -0,0 +1,52 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(hbond/dreiding/morse/omp,PairHbondDreidingMorseOMP)
+
+#else
+
+#ifndef LMP_PAIR_HBOND_DREIDING_MORSE_OMP_H
+#define LMP_PAIR_HBOND_DREIDING_MORSE_OMP_H
+
+#include "pair_hbond_dreiding_morse.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairHbondDreidingMorseOMP : public PairHbondDreidingMorse, public ThrOMP {
+
+ public:
+  PairHbondDreidingMorseOMP(class LAMMPS *);
+  virtual ~PairHbondDreidingMorseOMP();
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ protected:
+  double *hbcount_thr, *hbeng_thr;
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_line_lj_omp.cpp b/src/USER-OMP/pair_line_lj_omp.cpp
new file mode 100644
index 0000000000..27c630166e
--- /dev/null
+++ b/src/USER-OMP/pair_line_lj_omp.cpp
@@ -0,0 +1,341 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_line_lj_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include <string.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLineLJOMP::PairLineLJOMP(LAMMPS *lmp) :
+  PairLineLJ(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLineLJOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+  const int * const line = atom->line;
+  const int * const type = atom->type;
+
+  // grow discrete list if necessary and initialize
+
+  if (nall > nmax) {
+    nmax = nall;
+    memory->destroy(dnum);
+    memory->destroy(dfirst);
+    memory->create(dnum,nall,"pair:dnum");
+    memory->create(dfirst,nall,"pair:dfirst");
+  }
+  memset(dnum,0,nall*sizeof(int));
+  ndiscrete = 0;
+
+  // need to discretize the system ahead of time
+  // until we find a good way to multi-thread it.
+  for (int i = 0; i < nall; ++i)
+    if (line[i] >= 0)
+      if (dnum[i] == 0)
+        discretize(i,sigma[type[i]][type[i]]);
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLineLJOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  int ni,nj,npi,npj,ifirst,jfirst;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r6inv,term1,term2,sig,sig3,forcelj;
+  double xi[2],xj[2],fi[2],fj[2],dxi,dxj,dyi,dyj,ti,tj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  double * const * const torque = thr->get_torque();
+  const int * const line = atom->line;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq >= cutsq[itype][jtype]) continue;
+
+      // line/line interactions = NxN particles
+
+      evdwl = 0.0;
+      if (line[i] >= 0 && line[j] >= 0) {
+        npi = dnum[i];
+        ifirst = dfirst[i];
+        npj = dnum[j];
+        jfirst = dfirst[j];
+
+        fi[0] = fi[1] = fj[0] = fj[1] = ti = tj = 0.0;
+
+        for (ni = 0; ni < npi; ni++) {
+          dxi = discrete[ifirst+ni].dx;
+          dyi = discrete[ifirst+ni].dy;
+
+          for (nj = 0; nj < npj; nj++) {
+            dxj = discrete[jfirst+nj].dx;
+            dyj = discrete[jfirst+nj].dy;
+
+            xi[0] = x[i][0] + dxi;
+            xi[1] = x[i][1] + dyi;
+            xj[0] = x[j][0] + dxj;
+            xj[1] = x[j][1] + dyj;
+
+            delx = xi[0] - xj[0];
+            dely = xi[1] - xj[1];
+            rsq = delx*delx + dely*dely;
+
+            sig = 0.5 * (discrete[ifirst+ni].sigma+discrete[jfirst+nj].sigma);
+            sig3 = sig*sig*sig;
+            term2 = 24.0*epsilon[itype][jtype] * sig3*sig3;
+            term1 = 2.0 * term2 * sig3*sig3;
+            r2inv = 1.0/rsq;
+            r6inv = r2inv*r2inv*r2inv;
+            forcelj = r6inv * (term1*r6inv - term2);
+            fpair = forcelj*r2inv;
+
+            if (EFLAG) evdwl += r6inv*(term1/12.0*r6inv-term2/6.0);
+
+            fi[0] += delx*fpair;
+            fi[1] += dely*fpair;
+            ti += fpair*(dxi*dely - dyi*delx);
+
+            if (NEWTON_PAIR || j < nlocal) {
+              fj[0] -= delx*fpair;
+              fj[1] -= dely*fpair;
+              tj += fpair*(dxj*dely - dyj*delx);
+            }
+          }
+        }
+
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[j][0] += fj[0];
+        f[j][1] += fj[1];
+        torque[i][2] += ti;
+        torque[j][2] += tj;
+
+      // line/particle interaction = Nx1 particles
+      // convert line into Np particles based on sigma and line length
+
+      } else if (line[i] >= 0) {
+        npi = dnum[i];
+        ifirst = dfirst[i];
+
+        fi[0] = fi[1] = fj[0] = fj[1] = ti = tj = 0.0;
+
+        for (ni = 0; ni < npi; ni++) {
+          dxi = discrete[ifirst+ni].dx;
+          dyi = discrete[ifirst+ni].dy;
+
+          xi[0] = x[i][0] + dxi;
+          xi[1] = x[i][1] + dyi;
+          xj[0] = x[j][0];
+          xj[1] = x[j][1];
+
+          delx = xi[0] - xj[0];
+          dely = xi[1] - xj[1];
+          rsq = delx*delx + dely*dely;
+
+          sig = 0.5 * (discrete[ifirst+ni].sigma+sigma[jtype][jtype]);
+          sig3 = sig*sig*sig;
+          term2 = 24.0*epsilon[itype][jtype] * sig3*sig3;
+          term1 = 2.0 * term2 * sig3*sig3;
+          r2inv = 1.0/rsq;
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (term1*r6inv - term2);
+          fpair = forcelj*r2inv;
+
+          if (EFLAG) evdwl += r6inv*(term1/12.0*r6inv-term2/6.0);
+
+          fi[0] += delx*fpair;
+          fi[1] += dely*fpair;
+          ti += fpair*(dxi*dely - dyi*delx);
+
+          if (NEWTON_PAIR || j < nlocal) {
+            fj[0] -= delx*fpair;
+            fj[1] -= dely*fpair;
+            tj += fpair*(dxj*dely - dyj*delx);
+          }
+        }
+
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[j][0] += fj[0];
+        f[j][1] += fj[1];
+        torque[i][2] += ti;
+        torque[j][2] += tj;
+
+      // particle/line interaction = Nx1 particles
+      // convert line into Np particles based on sigma and line length
+
+      } else if (line[j] >= 0) {
+        npj = dnum[j];
+        jfirst = dfirst[j];
+
+        fi[0] = fi[1] = fj[0] = fj[1] = ti = tj = 0.0;
+
+        for (nj = 0; nj < npj; nj++) {
+          dxj = discrete[jfirst+nj].dx;
+          dyj = discrete[jfirst+nj].dy;
+
+          xi[0] = x[i][0];
+          xi[1] = x[i][1];
+          xj[0] = x[j][0] + dxj;
+          xj[1] = x[j][1] + dyj;
+
+          delx = xi[0] - xj[0];
+          dely = xi[1] - xj[1];
+          rsq = delx*delx + dely*dely;
+
+          sig = 0.5 * (sigma[itype][itype]+discrete[jfirst+nj].sigma);
+          sig3 = sig*sig*sig;
+          term2 = 24.0*epsilon[itype][jtype] * sig3*sig3;
+          term1 = 2.0 * term2 * sig3*sig3;
+          r2inv = 1.0/rsq;
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (term1*r6inv - term2);
+          fpair = forcelj*r2inv;
+
+          if (EFLAG) evdwl += r6inv*(term1/12.0*r6inv-term2/6.0);
+
+          fi[0] += delx*fpair;
+          fi[1] += dely*fpair;
+          ti += fpair*(dxi*dely - dyi*delx);
+
+          if (NEWTON_PAIR || j < nlocal) {
+            fj[0] -= delx*fpair;
+            fj[1] -= dely*fpair;
+            tj -= fpair*(dxj*dely - dyj*delx);
+          }
+        }
+
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[j][0] += fj[0];
+        f[j][1] += fj[1];
+        torque[i][2] += ti;
+        torque[j][2] += tj;
+
+      // particle/particle interaction = 1x1 particles
+
+      } else {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        fpair = forcelj*r2inv;
+
+        if (EFLAG)
+          evdwl += r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]);
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+      }
+
+      if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                               evdwl,0.0,fpair,delx,dely,delz,thr);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLineLJOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLineLJ::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_line_lj_omp.h b/src/USER-OMP/pair_line_lj_omp.h
new file mode 100644
index 0000000000..9a5e4333e0
--- /dev/null
+++ b/src/USER-OMP/pair_line_lj_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(line/lj/omp,PairLineLJOMP)
+
+#else
+
+#ifndef LMP_PAIR_LINE_LJ_OMP_H
+#define LMP_PAIR_LINE_LJ_OMP_H
+
+#include "pair_line_lj.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLineLJOMP : public PairLineLJ, public ThrOMP {
+
+ public:
+  PairLineLJOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj96_cut_omp.cpp b/src/USER-OMP/pair_lj96_cut_omp.cpp
new file mode 100644
index 0000000000..16827c7ee1
--- /dev/null
+++ b/src/USER-OMP/pair_lj96_cut_omp.cpp
@@ -0,0 +1,161 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj96_cut_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJ96CutOMP::PairLJ96CutOMP(LAMMPS *lmp) :
+  PairLJ96Cut(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  cut_respa = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJ96CutOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJ96CutOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        r3inv = sqrt(r6inv);
+
+        forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
+        fpair = factor_lj*forcelj*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype])
+            - offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJ96CutOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJ96Cut::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj96_cut_omp.h b/src/USER-OMP/pair_lj96_cut_omp.h
new file mode 100644
index 0000000000..037c68fc03
--- /dev/null
+++ b/src/USER-OMP/pair_lj96_cut_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj96/cut/omp,PairLJ96CutOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ96_CUT_OMP_H
+#define LMP_PAIR_LJ96_CUT_OMP_H
+
+#include "pair_lj96_cut.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJ96CutOMP : public PairLJ96Cut, public ThrOMP {
+
+ public:
+  PairLJ96CutOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_charmm_coul_charmm_implicit_omp.cpp b/src/USER-OMP/pair_lj_charmm_coul_charmm_implicit_omp.cpp
new file mode 100644
index 0000000000..739f754a23
--- /dev/null
+++ b/src/USER-OMP/pair_lj_charmm_coul_charmm_implicit_omp.cpp
@@ -0,0 +1,210 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_charmm_coul_charmm_implicit_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmImplicitOMP::PairLJCharmmCoulCharmmImplicitOMP(LAMMPS *lmp) :
+  PairLJCharmmCoulCharmmImplicit(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJCharmmCoulCharmmImplicitOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double philj,switch1,switch2;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+  const double invdenom_coul = (denom_coul != 0.0) ? 1.0/denom_coul : 0.0;
+  const double invdenom_lj   = (denom_lj   != 0.0) ? 1.0/denom_lj   : 0.0;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cut_bothsq) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsq) {
+          forcecoul = 2.0 * qqrd2e * qtmp*q[j]*r2inv;
+          if (rsq > cut_coul_innersq) {
+            switch1 = (cut_coulsq-rsq) * (cut_coulsq-rsq) *
+              (cut_coulsq + 2.0*rsq - 3.0*cut_coul_innersq) * invdenom_coul;
+            switch2 = 12.0*rsq * (cut_coulsq-rsq) *
+              (rsq-cut_coul_innersq) * invdenom_coul;
+            forcecoul *= switch1 + switch2;
+          }
+          forcecoul *= factor_coul;
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq) {
+          r6inv = r2inv*r2inv*r2inv;
+          jtype = type[j];
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          if (rsq > cut_lj_innersq) {
+            switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
+              (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) * invdenom_lj;
+            switch2 = 12.0*rsq * (cut_ljsq-rsq) *
+              (rsq-cut_lj_innersq) * invdenom_lj;
+            philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]);
+            forcelj = forcelj*switch1 + philj*switch2;
+          }
+          forcelj *= factor_lj;
+        } else forcelj = 0.0;
+
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq) {
+            ecoul = qqrd2e * qtmp*q[j]*r2inv;
+            if (rsq > cut_coul_innersq) {
+              switch1 = (cut_coulsq-rsq) * (cut_coulsq-rsq) *
+                (cut_coulsq + 2.0*rsq - 3.0*cut_coul_innersq) *
+                invdenom_coul;
+              ecoul *= switch1;
+            }
+            ecoul *= factor_coul;
+          } else ecoul = 0.0;
+          if (rsq < cut_ljsq) {
+            evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]);
+            if (rsq > cut_lj_innersq) {
+              switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
+                (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) * invdenom_lj;
+              evdwl *= switch1;
+            }
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCharmmCoulCharmmImplicitOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCharmmCoulCharmmImplicit::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_charmm_coul_charmm_implicit_omp.h b/src/USER-OMP/pair_lj_charmm_coul_charmm_implicit_omp.h
new file mode 100644
index 0000000000..9fb2835353
--- /dev/null
+++ b/src/USER-OMP/pair_lj_charmm_coul_charmm_implicit_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/charmm/implicit/omp,PairLJCharmmCoulCharmmImplicitOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_IMPLICIT_OMP_H
+#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_IMPLICIT_OMP_H
+
+#include "pair_lj_charmm_coul_charmm_implicit.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulCharmmImplicitOMP : public PairLJCharmmCoulCharmmImplicit, public ThrOMP {
+
+ public:
+  PairLJCharmmCoulCharmmImplicitOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_charmm_coul_charmm_omp.cpp b/src/USER-OMP/pair_lj_charmm_coul_charmm_omp.cpp
new file mode 100644
index 0000000000..318cb3aaa8
--- /dev/null
+++ b/src/USER-OMP/pair_lj_charmm_coul_charmm_omp.cpp
@@ -0,0 +1,210 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_charmm_coul_charmm_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmOMP::PairLJCharmmCoulCharmmOMP(LAMMPS *lmp) :
+  PairLJCharmmCoulCharmm(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJCharmmCoulCharmmOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double philj,switch1,switch2;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+  const double invdenom_coul = (denom_coul != 0.0) ? 1.0/denom_coul : 0.0;
+  const double invdenom_lj   = (denom_lj   != 0.0) ? 1.0/denom_lj   : 0.0;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cut_bothsq) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsq) {
+          forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv);
+          if (rsq > cut_coul_innersq) {
+            switch1 = (cut_coulsq-rsq) * (cut_coulsq-rsq) *
+              (cut_coulsq + 2.0*rsq - 3.0*cut_coul_innersq) * invdenom_coul;
+            switch2 = 12.0*rsq * (cut_coulsq-rsq) *
+              (rsq-cut_coul_innersq) * invdenom_coul;
+            forcecoul *= switch1 + switch2;
+          }
+          forcecoul *= factor_coul;
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq) {
+          r6inv = r2inv*r2inv*r2inv;
+          jtype = type[j];
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          if (rsq > cut_lj_innersq) {
+            switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
+              (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) * invdenom_lj;
+            switch2 = 12.0*rsq * (cut_ljsq-rsq) *
+              (rsq-cut_lj_innersq) * invdenom_lj;
+            philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]);
+            forcelj = forcelj*switch1 + philj*switch2;
+          }
+          forcelj *= factor_lj;
+        } else forcelj = 0.0;
+
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq) {
+            ecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv);
+            if (rsq > cut_coul_innersq) {
+              switch1 = (cut_coulsq-rsq) * (cut_coulsq-rsq) *
+                (cut_coulsq + 2.0*rsq - 3.0*cut_coul_innersq) *
+                invdenom_coul;
+              ecoul *= switch1;
+            }
+            ecoul *= factor_coul;
+          } else ecoul = 0.0;
+          if (rsq < cut_ljsq) {
+            evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]);
+            if (rsq > cut_lj_innersq) {
+              switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
+                (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) * invdenom_lj;
+              evdwl *= switch1;
+            }
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCharmmCoulCharmmOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCharmmCoulCharmm::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_charmm_coul_charmm_omp.h b/src/USER-OMP/pair_lj_charmm_coul_charmm_omp.h
new file mode 100644
index 0000000000..9591d62c85
--- /dev/null
+++ b/src/USER-OMP/pair_lj_charmm_coul_charmm_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/charmm/omp,PairLJCharmmCoulCharmmOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_OMP_H
+#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_OMP_H
+
+#include "pair_lj_charmm_coul_charmm.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulCharmmOMP : public PairLJCharmmCoulCharmm, public ThrOMP {
+
+ public:
+  PairLJCharmmCoulCharmmOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_charmm_coul_long_omp.cpp b/src/USER-OMP/pair_lj_charmm_coul_long_omp.cpp
new file mode 100644
index 0000000000..afdc527567
--- /dev/null
+++ b/src/USER-OMP/pair_lj_charmm_coul_long_omp.cpp
@@ -0,0 +1,223 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_charmm_coul_long_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulLongOMP::PairLJCharmmCoulLongOMP(LAMMPS *lmp) :
+  PairLJCharmmCoulLong(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  cut_respa = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJCharmmCoulLongOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  const double inv_denom_lj = 1.0/denom_lj;
+
+  const int * const ilist = list->ilist;
+  const int * const numneigh = list->numneigh;
+  const int * const * const firstneigh = list->firstneigh;
+  const int nlocal = atom->nlocal;
+
+  // loop over neighbors of my atoms
+
+  for (int ii = iifrom; ii < iito; ++ii) {
+
+    const int i = ilist[ii];
+    const int itype = type[i];
+    const double qtmp = q[i];
+    const double xtmp = x[i][0];
+    const double ytmp = x[i][1];
+    const double ztmp = x[i][2];
+    double fxtmp,fytmp,fztmp;
+    fxtmp=fytmp=fztmp=0.0;
+
+    const int * const jlist = firstneigh[i];
+    const int jnum = numneigh[i];
+
+    for (int jj = 0; jj < jnum; jj++) {
+      double forcecoul, forcelj, evdwl, ecoul;
+      forcecoul = forcelj = evdwl = ecoul = 0.0;
+
+      const int sbindex = sbmask(jlist[jj]);
+      const int j = jlist[jj] & NEIGHMASK;
+
+      const double delx = xtmp - x[j][0];
+      const double dely = ytmp - x[j][1];
+      const double delz = ztmp - x[j][2];
+      const double rsq = delx*delx + dely*dely + delz*delz;
+      const int jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        const double r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsq) {
+          if (!ncoultablebits || rsq <= tabinnersq) {
+            const double A1 =  0.254829592;
+            const double A2 = -0.284496736;
+            const double A3 =  1.421413741;
+            const double A4 = -1.453152027;
+            const double A5 =  1.061405429;
+            const double EWALD_F = 1.12837917;
+            const double INV_EWALD_P = 1.0/0.3275911;
+
+            const double r = sqrt(rsq);
+            const double grij = g_ewald * r;
+            const double expm2 = exp(-grij*grij);
+            const double t = INV_EWALD_P / (INV_EWALD_P + grij);
+            const double erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            const double prefactor = qqrd2e * qtmp*q[j]/r;
+            forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+            if (EFLAG) ecoul = prefactor*erfc;
+            if (sbindex) {
+              const double adjust = (1.0-special_coul[sbindex])*prefactor;
+              forcecoul -= adjust;
+              if (EFLAG) ecoul -= adjust;
+            }
+          } else {
+            union_int_float_t rsq_lookup;
+            rsq_lookup.f = rsq;
+            const int itable = (rsq_lookup.i & ncoulmask) >> ncoulshiftbits;
+            const double fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+            const double table = ftable[itable] + fraction*dftable[itable];
+            forcecoul = qtmp*q[j] * table;
+            if (EFLAG) ecoul = qtmp*q[j] * (etable[itable] + fraction*detable[itable]);
+            if (sbindex) {
+              const double table2 = ctable[itable] + fraction*dctable[itable];
+              const double prefactor = qtmp*q[j] * table2;
+              const double adjust = (1.0-special_coul[sbindex])*prefactor;
+              forcecoul -= adjust;
+              if (EFLAG) ecoul -= adjust;
+            }
+          }
+        }
+
+        if (rsq < cut_ljsq) {
+          const double r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          if (EFLAG) evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]);
+
+          if (rsq > cut_lj_innersq) {
+            const double drsq = cut_ljsq - rsq;
+            const double cut2 = (rsq - cut_lj_innersq) * drsq;
+            const double switch1 = drsq * (drsq*drsq + 3.0*cut2) * inv_denom_lj;
+            const double switch2 = 12.0*rsq * cut2 * inv_denom_lj;
+            if (EFLAG) {
+              forcelj = forcelj*switch1 + evdwl*switch2;
+              evdwl *= switch1;
+            } else {
+              const double philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]);
+              forcelj =  forcelj*switch1 + philj*switch2;
+            }
+          }
+
+          if (sbindex) {
+            const double factor_lj = special_lj[sbindex];
+            forcelj *= factor_lj;
+            if (EFLAG) evdwl *= factor_lj;
+          }
+
+        }
+        const double fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCharmmCoulLongOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCharmmCoulLong::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_charmm_coul_long_omp.h b/src/USER-OMP/pair_lj_charmm_coul_long_omp.h
new file mode 100644
index 0000000000..ba28d05114
--- /dev/null
+++ b/src/USER-OMP/pair_lj_charmm_coul_long_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/long/omp,PairLJCharmmCoulLongOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_LONG_OMP_H
+#define LMP_PAIR_LJ_CHARMM_COUL_LONG_OMP_H
+
+#include "pair_lj_charmm_coul_long.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulLongOMP : public PairLJCharmmCoulLong, public ThrOMP {
+
+ public:
+  PairLJCharmmCoulLongOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_charmm_coul_pppm_omp.cpp b/src/USER-OMP/pair_lj_charmm_coul_pppm_omp.cpp
new file mode 100644
index 0000000000..b5878dfe3d
--- /dev/null
+++ b/src/USER-OMP/pair_lj_charmm_coul_pppm_omp.cpp
@@ -0,0 +1,261 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_charmm_coul_pppm_omp.h"
+#include "pppm_proxy.h"
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "update.h"
+
+#include <string.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulPPPMOMP::PairLJCharmmCoulPPPMOMP(LAMMPS *lmp) :
+  PairLJCharmmCoulLong(lmp), ThrOMP(lmp, THR_PAIR|THR_PROXY)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  nproxy=1;
+
+  kspace = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulPPPMOMP::init_style()
+{
+  if (comm->nthreads < 2)
+    error->all(FLERR,"need at least two threads per MPI task for this pair style");
+
+  if (strcmp(force->kspace_style,"pppm/proxy") != 0)
+    error->all(FLERR,"kspace style pppm/proxy is required with this pair style");
+
+  kspace = static_cast<PPPMProxy *>(force->kspace);
+
+  PairLJCharmmCoulLong::init_style();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulPPPMOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads, nproxy);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // thread id 0 runs pppm, the rest the pair style
+    if (tid < nproxy) {
+      kspace->compute_proxy(eflag,vflag);
+    } else {
+      if (evflag) {
+        if (eflag) {
+          if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+          else eval<1,1,0>(ifrom, ito, thr);
+        } else {
+          if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+          else eval<1,0,0>(ifrom, ito, thr);
+        }
+      } else {
+        if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+        else eval<0,0,0>(ifrom, ito, thr);
+      }
+    }
+
+    sync_threads();
+    reduce_thr(this, eflag, vflag, thr, nproxy);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJCharmmCoulPPPMOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype,itable;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double fraction,table;
+  double r,rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double grij,expm2,prefactor,t,erfc;
+  double philj,switch1,switch2;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsq) {
+          if (!ncoultablebits || rsq <= tabinnersq) {
+            r = sqrt(rsq);
+            grij = g_ewald * r;
+            expm2 = exp(-grij*grij);
+            t = 1.0 / (1.0 + EWALD_P*grij);
+            erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            prefactor = qqrd2e * qtmp*q[j]/r;
+            forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+            if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+          } else {
+            union_int_float_t rsq_lookup;
+            rsq_lookup.f = rsq;
+            itable = rsq_lookup.i & ncoulmask;
+            itable >>= ncoulshiftbits;
+            fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+            table = ftable[itable] + fraction*dftable[itable];
+            forcecoul = qtmp*q[j] * table;
+            if (factor_coul < 1.0) {
+              table = ctable[itable] + fraction*dctable[itable];
+              prefactor = qtmp*q[j] * table;
+              forcecoul -= (1.0-factor_coul)*prefactor;
+            }
+          }
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq) {
+          r6inv = r2inv*r2inv*r2inv;
+          jtype = type[j];
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          if (rsq > cut_lj_innersq) {
+            switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
+              (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj;
+            switch2 = 12.0*rsq * (cut_ljsq-rsq) *
+              (rsq-cut_lj_innersq) / denom_lj;
+            philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]);
+            forcelj = forcelj*switch1 + philj*switch2;
+          }
+          forcelj *= factor_lj;
+        } else forcelj = 0.0;
+
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq) {
+            if (!ncoultablebits || rsq <= tabinnersq)
+              ecoul = prefactor*erfc;
+            else {
+              table = etable[itable] + fraction*detable[itable];
+              ecoul = qtmp*q[j] * table;
+            }
+            if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+          } else ecoul = 0.0;
+
+          if (rsq < cut_ljsq) {
+            evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]);
+            if (rsq > cut_lj_innersq) {
+              switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
+                (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj;
+              evdwl *= switch1;
+            }
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCharmmCoulPPPMOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCharmmCoulLong::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_charmm_coul_pppm_omp.h b/src/USER-OMP/pair_lj_charmm_coul_pppm_omp.h
new file mode 100644
index 0000000000..835f40389f
--- /dev/null
+++ b/src/USER-OMP/pair_lj_charmm_coul_pppm_omp.h
@@ -0,0 +1,54 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/pppm/omp,PairLJCharmmCoulPPPMOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_PPPM_OMP_H
+#define LMP_PAIR_LJ_CHARMM_COUL_PPPM_OMP_H
+
+#include "pair_lj_charmm_coul_long.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulPPPMOMP : public PairLJCharmmCoulLong, public ThrOMP {
+
+ public:
+  PairLJCharmmCoulPPPMOMP(class LAMMPS *);
+  virtual ~PairLJCharmmCoulPPPMOMP() {};
+
+  virtual void init_style();
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+
+  class PPPMProxy *kspace;
+  int nproxy;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_class2_coul_cut_omp.cpp b/src/USER-OMP/pair_lj_class2_coul_cut_omp.cpp
new file mode 100644
index 0000000000..361923cb54
--- /dev/null
+++ b/src/USER-OMP/pair_lj_class2_coul_cut_omp.cpp
@@ -0,0 +1,183 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_class2_coul_cut_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJClass2CoulCutOMP::PairLJClass2CoulCutOMP(LAMMPS *lmp) :
+  PairLJClass2CoulCut(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulCutOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJClass2CoulCutOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,rinv,r2inv,r3inv,r6inv,forcecoul,forcelj;
+  double factor_coul,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        rinv = sqrt(r2inv);
+
+        if (rsq < cut_coulsq[itype][jtype]) {
+          forcecoul = qqrd2e * qtmp*q[j]*rinv;
+          forcecoul *= factor_coul;
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r3inv = r2inv*rinv;
+          r6inv = r3inv*r3inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
+          forcelj *= factor_lj;
+        } else forcelj = 0.0;
+
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq[itype][jtype])
+            ecoul = factor_coul * qqrd2e * qtmp*q[j]*rinv;
+          else ecoul = 0.0;
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
+              offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJClass2CoulCutOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJClass2CoulCut::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_class2_coul_cut_omp.h b/src/USER-OMP/pair_lj_class2_coul_cut_omp.h
new file mode 100644
index 0000000000..f03da67a15
--- /dev/null
+++ b/src/USER-OMP/pair_lj_class2_coul_cut_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/class2/coul/cut/omp,PairLJClass2CoulCutOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CLASS2_COUL_CUT_OMP_H
+#define LMP_PAIR_LJ_CLASS2_COUL_CUT_OMP_H
+
+#include "pair_lj_class2_coul_cut.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJClass2CoulCutOMP : public PairLJClass2CoulCut, public ThrOMP {
+
+ public:
+  PairLJClass2CoulCutOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_class2_coul_long_omp.cpp b/src/USER-OMP/pair_lj_class2_coul_long_omp.cpp
new file mode 100644
index 0000000000..6049d7a98c
--- /dev/null
+++ b/src/USER-OMP/pair_lj_class2_coul_long_omp.cpp
@@ -0,0 +1,199 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_class2_coul_long_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairLJClass2CoulLongOMP::PairLJClass2CoulLongOMP(LAMMPS *lmp) :
+  PairLJClass2CoulLong(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulLongOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJClass2CoulLongOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double r,rsq,rinv,r2inv,r3inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double grij,expm2,prefactor,t,erfc;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsq) {
+          r = sqrt(rsq);
+          grij = g_ewald * r;
+          expm2 = exp(-grij*grij);
+          t = 1.0 / (1.0 + EWALD_P*grij);
+          erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          prefactor = qqrd2e * qtmp*q[j]/r;
+          forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+          if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          rinv = sqrt(r2inv);
+          r3inv = r2inv*rinv;
+          r6inv = r3inv*r3inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
+          forcelj *= factor_lj;
+        } else forcelj = 0.0;
+
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq) {
+            ecoul = prefactor*erfc;
+            if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+          } else ecoul = 0.0;
+
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
+              offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJClass2CoulLongOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJClass2CoulLong::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_class2_coul_long_omp.h b/src/USER-OMP/pair_lj_class2_coul_long_omp.h
new file mode 100644
index 0000000000..973850c25c
--- /dev/null
+++ b/src/USER-OMP/pair_lj_class2_coul_long_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/class2/coul/long/omp,PairLJClass2CoulLongOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CLASS2_COUL_LONG_OMP_H
+#define LMP_PAIR_LJ_CLASS2_COUL_LONG_OMP_H
+
+#include "pair_lj_class2_coul_long.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJClass2CoulLongOMP : public PairLJClass2CoulLong, public ThrOMP {
+
+ public:
+  PairLJClass2CoulLongOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_class2_coul_pppm_omp.cpp b/src/USER-OMP/pair_lj_class2_coul_pppm_omp.cpp
new file mode 100644
index 0000000000..1b0ca0ec7d
--- /dev/null
+++ b/src/USER-OMP/pair_lj_class2_coul_pppm_omp.cpp
@@ -0,0 +1,228 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_class2_coul_pppm_omp.h"
+#include "pppm_proxy.h"
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "update.h"
+
+#include <string.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairLJClass2CoulPPPMOMP::PairLJClass2CoulPPPMOMP(LAMMPS *lmp) :
+  PairLJClass2CoulLong(lmp), ThrOMP(lmp, THR_PAIR|THR_PROXY)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  nproxy=1;
+
+  kspace = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulPPPMOMP::init_style()
+{
+  if (comm->nthreads < 2)
+    error->all(FLERR,"need at least two threads per MPI task for this pair style");
+
+  if (strcmp(force->kspace_style,"pppm/proxy") != 0)
+    error->all(FLERR,"kspace style pppm/proxy is required with this pair style");
+
+  kspace = static_cast<PPPMProxy *>(force->kspace);
+
+  PairLJClass2CoulLong::init_style();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulPPPMOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads, nproxy);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // thread id 0 runs pppm, the rest the pair style
+    if (tid < nproxy) {
+      kspace->compute_proxy(eflag,vflag);
+    } else {
+      if (evflag) {
+        if (eflag) {
+          if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+          else eval<1,1,0>(ifrom, ito, thr);
+        } else {
+          if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+          else eval<1,0,0>(ifrom, ito, thr);
+        }
+      } else {
+        if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+        else eval<0,0,0>(ifrom, ito, thr);
+      }
+    }
+
+    sync_threads();
+    reduce_thr(this, eflag, vflag, thr, nproxy);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJClass2CoulPPPMOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double r,rsq,rinv,r2inv,r3inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double grij,expm2,prefactor,t,erfc;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsq) {
+          r = sqrt(rsq);
+          grij = g_ewald * r;
+          expm2 = exp(-grij*grij);
+          t = 1.0 / (1.0 + EWALD_P*grij);
+          erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+          prefactor = qqrd2e * qtmp*q[j]/r;
+          forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+          if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          rinv = sqrt(r2inv);
+          r3inv = r2inv*rinv;
+          r6inv = r3inv*r3inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
+          forcelj *= factor_lj;
+        } else forcelj = 0.0;
+
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq) {
+            ecoul = prefactor*erfc;
+            if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+          } else ecoul = 0.0;
+
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
+              offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJClass2CoulPPPMOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJClass2CoulLong::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_class2_coul_pppm_omp.h b/src/USER-OMP/pair_lj_class2_coul_pppm_omp.h
new file mode 100644
index 0000000000..8a5def6bcc
--- /dev/null
+++ b/src/USER-OMP/pair_lj_class2_coul_pppm_omp.h
@@ -0,0 +1,54 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/class2/coul/pppm/omp,PairLJClass2CoulPPPMOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CLASS2_COUL_PPPM_OMP_H
+#define LMP_PAIR_LJ_CLASS2_COUL_PPPM_OMP_H
+
+#include "pair_lj_class2_coul_long.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJClass2CoulPPPMOMP : public PairLJClass2CoulLong, public ThrOMP {
+
+ public:
+  PairLJClass2CoulPPPMOMP(class LAMMPS *);
+  virtual ~PairLJClass2CoulPPPMOMP() {};
+
+  virtual void init_style();
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+
+  class PPPMProxy *kspace;
+  int nproxy;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_class2_omp.cpp b/src/USER-OMP/pair_lj_class2_omp.cpp
new file mode 100644
index 0000000000..978335d21e
--- /dev/null
+++ b/src/USER-OMP/pair_lj_class2_omp.cpp
@@ -0,0 +1,160 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_class2_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJClass2OMP::PairLJClass2OMP(LAMMPS *lmp) :
+  PairLJClass2(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2OMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJClass2OMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        r3inv = sqrt(r6inv);
+
+        forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
+        fpair = factor_lj*forcelj*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype])
+            - offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJClass2OMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJClass2::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_class2_omp.h b/src/USER-OMP/pair_lj_class2_omp.h
new file mode 100644
index 0000000000..5acd2cfb39
--- /dev/null
+++ b/src/USER-OMP/pair_lj_class2_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/class2/omp,PairLJClass2OMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CLASS2_OMP_H
+#define LMP_PAIR_LJ_CLASS2_OMP_H
+
+#include "pair_lj_class2.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJClass2OMP : public PairLJClass2, public ThrOMP {
+
+ public:
+  PairLJClass2OMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_coul_omp.cpp b/src/USER-OMP/pair_lj_coul_omp.cpp
new file mode 100644
index 0000000000..21fd9ff8cd
--- /dev/null
+++ b/src/USER-OMP/pair_lj_coul_omp.cpp
@@ -0,0 +1,233 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_coul_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "math_vector.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCoulOMP::PairLJCoulOMP(LAMMPS *lmp) :
+  PairLJCoul(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  cut_respa = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCoulOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJCoulOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  double evdwl,ecoul,fpair;
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+
+  const double *x0 = x[0];
+  double *f0 = f[0], *fi = f0;
+
+  int *ilist = list->ilist;
+
+  // loop over neighbors of my atoms
+
+  int i, ii, j, order1 = ewald_order&(1<<1), order6 = ewald_order&(1<<6);
+  int *jneigh, *jneighn, typei, typej, ni;
+  double qi, qri, *cutsqi, *cut_ljsqi, *lj1i, *lj2i, *lj3i, *lj4i, *offseti;
+  double rsq, r2inv, force_coul, force_lj;
+  double g2 = g_ewald*g_ewald, g6 = g2*g2*g2, g8 = g6*g2;
+  vector xi, d;
+
+  for (ii = iifrom; ii < iito; ++ii) {                        // loop over my atoms
+    i = ilist[ii]; fi = f0+3*i;
+    if (order1) qri = (qi = q[i])*qqrd2e;                // initialize constants
+    offseti = offset[typei = type[i]];
+    lj1i = lj1[typei]; lj2i = lj2[typei]; lj3i = lj3[typei]; lj4i = lj4[typei];
+    cutsqi = cutsq[typei]; cut_ljsqi = cut_ljsq[typei];
+    memcpy(xi, x0+(i+(i<<1)), sizeof(vector));
+    jneighn = (jneigh = list->firstneigh[i])+list->numneigh[i];
+
+    for (; jneigh<jneighn; ++jneigh) {                        // loop over neighbors
+      j = *jneigh;
+      ni = sbmask(j);
+      j &= NEIGHMASK;
+
+      { register const double *xj = x0+(j+(j<<1));
+        d[0] = xi[0] - xj[0];                                // pair vector
+        d[1] = xi[1] - xj[1];
+        d[2] = xi[2] - xj[2]; }
+
+      if ((rsq = vec_dot(d, d)) >= cutsqi[typej = type[j]]) continue;
+      r2inv = 1.0/rsq;
+
+      if (order1 && (rsq < cut_coulsq)) {                // coulombic
+        if (!ncoultablebits || rsq <= tabinnersq) {        // series real space
+          register double r = sqrt(rsq), x = g_ewald*r;
+          register double s = qri*q[j], t = 1.0/(1.0+EWALD_P*x);
+          if (ni == 0) {
+            s *= g_ewald*exp(-x*x);
+            force_coul = (t *= ((((t*A5+A4)*t+A3)*t+A2)*t+A1)*s/x)+EWALD_F*s;
+            if (EFLAG) ecoul = t;
+          }
+          else {                                        // special case
+            r = s*(1.0-special_coul[ni])/r; s *= g_ewald*exp(-x*x);
+            force_coul = (t *= ((((t*A5+A4)*t+A3)*t+A2)*t+A1)*s/x)+EWALD_F*s-r;
+            if (EFLAG) ecoul = t-r;
+          }
+        }                                                // table real space
+        else {
+          register union_int_float_t t;
+          t.f = rsq;
+          register const int k = (t.i & ncoulmask)>>ncoulshiftbits;
+          register double f = (rsq-rtable[k])*drtable[k], qiqj = qi*q[j];
+          if (ni == 0) {
+            force_coul = qiqj*(ftable[k]+f*dftable[k]);
+            if (EFLAG) ecoul = qiqj*(etable[k]+f*detable[k]);
+          }
+          else {                                        // special case
+            t.f = (1.0-special_coul[ni])*(ctable[k]+f*dctable[k]);
+            force_coul = qiqj*(ftable[k]+f*dftable[k]-t.f);
+            if (EFLAG) ecoul = qiqj*(etable[k]+f*detable[k]-t.f);
+          }
+        }
+      }
+      else force_coul = ecoul = 0.0;
+
+      if (rsq < cut_ljsqi[typej]) {                        // lj
+               if (order6) {                                        // long-range lj
+          register double rn = r2inv*r2inv*r2inv;
+          register double x2 = g2*rsq, a2 = 1.0/x2;
+          x2 = a2*exp(-x2)*lj4i[typej];
+          if (ni == 0) {
+            force_lj =
+              (rn*=rn)*lj1i[typej]-g8*(((6.0*a2+6.0)*a2+3.0)*a2+1.0)*x2*rsq;
+            if (EFLAG)
+              evdwl = rn*lj3i[typej]-g6*((a2+1.0)*a2+0.5)*x2;
+          }
+          else {                                        // special case
+            register double f = special_lj[ni], t = rn*(1.0-f);
+            force_lj = f*(rn *= rn)*lj1i[typej]-
+              g8*(((6.0*a2+6.0)*a2+3.0)*a2+1.0)*x2*rsq+t*lj2i[typej];
+            if (EFLAG)
+              evdwl = f*rn*lj3i[typej]-g6*((a2+1.0)*a2+0.5)*x2+t*lj4i[typej];
+          }
+        }
+        else {                                                // cut lj
+          register double rn = r2inv*r2inv*r2inv;
+          if (ni == 0) {
+            force_lj = rn*(rn*lj1i[typej]-lj2i[typej]);
+            if (EFLAG) evdwl = rn*(rn*lj3i[typej]-lj4i[typej])-offseti[typej];
+          }
+          else {                                        // special case
+            register double f = special_lj[ni];
+            force_lj = f*rn*(rn*lj1i[typej]-lj2i[typej]);
+            if (EFLAG)
+              evdwl = f * (rn*(rn*lj3i[typej]-lj4i[typej])-offseti[typej]);
+          }
+        }
+      }
+      else force_lj = evdwl = 0.0;
+
+      fpair = (force_coul+force_lj)*r2inv;
+
+      if (NEWTON_PAIR || j < nlocal) {
+        register double *fj = f0+(j+(j<<1)), f;
+        fi[0] += f = d[0]*fpair; fj[0] -= f;
+        fi[1] += f = d[1]*fpair; fj[1] -= f;
+        fi[2] += f = d[2]*fpair; fj[2] -= f;
+      }
+      else {
+        fi[0] += d[0]*fpair;
+        fi[1] += d[1]*fpair;
+        fi[2] += d[2]*fpair;
+      }
+
+      if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                               evdwl,ecoul,fpair,d[0],d[1],d[2],thr);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCoulOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCoul::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_coul_omp.h b/src/USER-OMP/pair_lj_coul_omp.h
new file mode 100644
index 0000000000..6e988126a3
--- /dev/null
+++ b/src/USER-OMP/pair_lj_coul_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/coul/omp,PairLJCoulOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_COUL_OMP_H
+#define LMP_PAIR_LJ_COUL_OMP_H
+
+#include "pair_lj_coul.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCoulOMP : public PairLJCoul, public ThrOMP {
+
+ public:
+  PairLJCoulOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_cubic_omp.cpp b/src/USER-OMP/pair_lj_cubic_omp.cpp
new file mode 100644
index 0000000000..bf4b2b2bc8
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cubic_omp.cpp
@@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cubic_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace PairLJCubicConstants;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCubicOMP::PairLJCubicOMP(LAMMPS *lmp) :
+  PairLJCubic(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCubicOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJCubicOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r6inv,forcelj,factor_lj;
+  double r,t,rmin;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        if (rsq <= cut_inner_sq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        } else {
+          r = sqrt(rsq);
+          rmin = sigma[itype][jtype]*RT6TWO;
+          t = (r - cut_inner[itype][jtype])/rmin;
+          forcelj = epsilon[itype][jtype]*(-DPHIDS + A3*t*t/2.0)*r/rmin;
+        }
+        fpair = factor_lj*forcelj*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq <= cut_inner_sq[itype][jtype])
+            evdwl = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]);
+          else
+            evdwl = epsilon[itype][jtype]*
+              (PHIS + DPHIDS*t - A3*t*t*t/6.0);
+
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCubicOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCubic::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_cubic_omp.h b/src/USER-OMP/pair_lj_cubic_omp.h
new file mode 100644
index 0000000000..78b298bb35
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cubic_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cubic/omp,PairLJCubicOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUBIC_OMP_H
+#define LMP_PAIR_LJ_CUBIC_OMP_H
+
+#include "pair_lj_cubic.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCubicOMP : public PairLJCubic, public ThrOMP {
+
+ public:
+  PairLJCubicOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_cut_coul_cut_omp.cpp b/src/USER-OMP/pair_lj_cut_coul_cut_omp.cpp
new file mode 100644
index 0000000000..6c1bdedef4
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_coul_cut_omp.cpp
@@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cut_coul_cut_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulCutOMP::PairLJCutCoulCutOMP(LAMMPS *lmp) :
+  PairLJCutCoulCut(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulCutOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJCutCoulCutOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,rinv,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsq[itype][jtype]) {
+          rinv = sqrt(r2inv);
+          forcecoul = qqrd2e * qtmp*q[j]*rinv;
+          forcecoul *= factor_coul;
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          forcelj *= factor_lj;
+        } else forcelj = 0.0;
+
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq[itype][jtype])
+            ecoul = factor_coul * qqrd2e * qtmp*q[j]*rinv;
+          else ecoul = 0.0;
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+              offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCutCoulCutOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCutCoulCut::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_cut_coul_cut_omp.h b/src/USER-OMP/pair_lj_cut_coul_cut_omp.h
new file mode 100644
index 0000000000..0d5de5b7b7
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_coul_cut_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/cut/omp,PairLJCutCoulCutOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_CUT_OMP_H
+#define LMP_PAIR_LJ_CUT_COUL_CUT_OMP_H
+
+#include "pair_lj_cut_coul_cut.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulCutOMP : public PairLJCutCoulCut, public ThrOMP {
+
+ public:
+  PairLJCutCoulCutOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_cut_coul_debye_omp.cpp b/src/USER-OMP/pair_lj_cut_coul_debye_omp.cpp
new file mode 100644
index 0000000000..81a73d78b1
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_coul_debye_omp.cpp
@@ -0,0 +1,184 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cut_coul_debye_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulDebyeOMP::PairLJCutCoulDebyeOMP(LAMMPS *lmp) :
+  PairLJCutCoulDebye(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulDebyeOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJCutCoulDebyeOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double r,rinv,screening;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsq[itype][jtype]) {
+          r = sqrt(rsq);
+          rinv = 1.0/r;
+          screening = exp(-kappa*r);
+          forcecoul = qqrd2e * qtmp*q[j] * screening * (kappa + rinv);
+          forcecoul *= factor_coul;
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          forcelj *= factor_lj;
+        } else forcelj = 0.0;
+
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq[itype][jtype])
+            ecoul = factor_coul * qqrd2e * qtmp*q[j] * rinv * screening;
+          else ecoul = 0.0;
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+              offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCutCoulDebyeOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCutCoulDebye::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_cut_coul_debye_omp.h b/src/USER-OMP/pair_lj_cut_coul_debye_omp.h
new file mode 100644
index 0000000000..b327fc665f
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_coul_debye_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/debye/omp,PairLJCutCoulDebyeOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_DEBYE_OMP_H
+#define LMP_PAIR_LJ_CUT_COUL_DEBYE_OMP_H
+
+#include "pair_lj_cut_coul_debye.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulDebyeOMP : public PairLJCutCoulDebye, public ThrOMP {
+
+ public:
+  PairLJCutCoulDebyeOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_cut_coul_long_omp.cpp b/src/USER-OMP/pair_lj_cut_coul_long_omp.cpp
new file mode 100644
index 0000000000..19639ac731
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_coul_long_omp.cpp
@@ -0,0 +1,219 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cut_coul_long_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulLongOMP::PairLJCutCoulLongOMP(LAMMPS *lmp) :
+  PairLJCutCoulLong(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  cut_respa = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulLongOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJCutCoulLongOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype,itable;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double fraction,table;
+  double r,rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double grij,expm2,prefactor,t,erfc;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsq) {
+          if (!ncoultablebits || rsq <= tabinnersq) {
+            r = sqrt(rsq);
+            grij = g_ewald * r;
+            expm2 = exp(-grij*grij);
+            t = 1.0 / (1.0 + EWALD_P*grij);
+            erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            prefactor = qqrd2e * qtmp*q[j]/r;
+            forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+            if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+          } else {
+            union_int_float_t rsq_lookup;
+            rsq_lookup.f = rsq;
+            itable = rsq_lookup.i & ncoulmask;
+            itable >>= ncoulshiftbits;
+            fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+            table = ftable[itable] + fraction*dftable[itable];
+            forcecoul = qtmp*q[j] * table;
+            if (factor_coul < 1.0) {
+              table = ctable[itable] + fraction*dctable[itable];
+              prefactor = qtmp*q[j] * table;
+              forcecoul -= (1.0-factor_coul)*prefactor;
+            }
+          }
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          forcelj *= factor_lj;
+        } else forcelj = 0.0;
+
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq) {
+            if (!ncoultablebits || rsq <= tabinnersq)
+              ecoul = prefactor*erfc;
+            else {
+              table = etable[itable] + fraction*detable[itable];
+              ecoul = qtmp*q[j] * table;
+            }
+            if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+          } else ecoul = 0.0;
+
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+              offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCutCoulLongOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCutCoulLong::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_cut_coul_long_omp.h b/src/USER-OMP/pair_lj_cut_coul_long_omp.h
new file mode 100644
index 0000000000..cea3271af0
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_coul_long_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/long/omp,PairLJCutCoulLongOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_OMP_H
+#define LMP_PAIR_LJ_CUT_COUL_LONG_OMP_H
+
+#include "pair_lj_cut_coul_long.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulLongOMP : public PairLJCutCoulLong, public ThrOMP {
+
+ public:
+  PairLJCutCoulLongOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_cut_coul_long_tip4p_omp.cpp b/src/USER-OMP/pair_lj_cut_coul_long_tip4p_omp.cpp
new file mode 100644
index 0000000000..640fa836b5
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_coul_long_tip4p_omp.cpp
@@ -0,0 +1,513 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cut_coul_long_tip4p_omp.h"
+#include "atom.h"
+#include "domain.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "error.h"
+#include "memory.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulLongTIP4POMP::PairLJCutCoulLongTIP4POMP(LAMMPS *lmp) :
+  PairLJCutCoulLongTIP4P(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+
+  // TIP4P cannot compute virial as F dot r
+  // due to finding bonded H atoms which are not near O atom
+
+  no_virial_fdotr_compute = 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulLongTIP4POMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  // reallocate hneigh & newsite if necessary
+  // initialize hneigh[0] to -1 on steps when reneighboring occurred
+  // initialize hneigh[2] to 0 every step
+
+  if (atom->nmax > nmax) {
+    nmax = atom->nmax;
+    memory->destroy(hneigh);
+    memory->create(hneigh,nmax,3,"pair:hneigh");
+    memory->destroy(newsite);
+    memory->create(newsite,nmax,3,"pair:newsite");
+  }
+
+  // XXX: this could be threaded, too.
+  int i;
+  if (neighbor->ago == 0)
+    for (i = 0; i < nall; i++) hneigh[i][0] = -1;
+  for (i = 0; i < nall; i++) hneigh[i][2] = 0;
+
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+  if (!ncoultablebits) {
+    if (evflag) {
+      if (eflag) {
+        if (vflag) eval<1,1,1,1>(ifrom, ito, thr);
+        else eval<1,1,1,0>(ifrom, ito, thr);
+      } else {
+        if (vflag) eval<1,1,0,1>(ifrom, ito, thr);
+        else eval<1,1,0,0>(ifrom, ito, thr);
+      }
+    } else eval<1,0,0,0>(ifrom, ito, thr);
+  } else {
+    if (evflag) {
+      if (eflag) {
+        if (vflag) eval<0,1,1,1>(ifrom, ito, thr);
+        else eval<0,1,1,0>(ifrom, ito, thr);
+      } else {
+        if (vflag) eval<0,1,0,1>(ifrom, ito, thr);
+        else eval<0,1,0,0>(ifrom, ito, thr);
+      }
+    } else eval<0,0,0,0>(ifrom, ito, thr);
+  }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int CTABLE, int EVFLAG, int EFLAG, int VFLAG>
+void PairLJCutCoulLongTIP4POMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype,itable;
+  int n,vlist[6];
+  int iH1,iH2,jH1,jH2;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul;
+  double fraction,table;
+  double delxOM,delyOM,delzOM;
+  double r,rsq,r2inv,r6inv,forcecoul,forcelj,cforce;
+  double factor_coul,factor_lj;
+  double grij,expm2,prefactor,t,erfc,ddotf;
+  double v[6],xH1[3],xH2[3];
+  double fdx,fdy,fdz,f1x,f1y,f1z,fOx,fOy,fOz,fHx,fHy,fHz;
+  const double *x1,*x2;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const int tid = thr->get_tid();
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  const double cut_coulsqplus = (cut_coul+2.0*qdist) * (cut_coul+2.0*qdist);
+
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+
+    // if atom I = water O, set x1 = offset charge site
+    // else x1 = x of atom I
+    // NOTE: to make this part thread safe, we need to
+    // make sure that the hneigh[][] entries only get
+    // updated, when all data is in place. worst case,
+    // some calculation is repeated, but since the results
+    // will be the same, there is no race condition.
+    if (itype == typeO) {
+      if (hneigh[i][0] < 0) {
+        iH1 = atom->map(atom->tag[i] + 1);
+        iH2 = atom->map(atom->tag[i] + 2);
+        if (iH1 == -1 || iH2 == -1)
+          error->one(FLERR,"TIP4P hydrogen is missing");
+        if (atom->type[iH1] != typeH || atom->type[iH2] != typeH)
+          error->one(FLERR,"TIP4P hydrogen has incorrect atom type");
+        compute_newsite_thr(x[i],x[iH1],x[iH2],newsite[i]);
+        hneigh[i][2] = 1;
+        hneigh[i][1] = iH2;
+        hneigh[i][0] = iH1;
+      } else {
+        iH1 = hneigh[i][0];
+        iH2 = hneigh[i][1];
+        if (hneigh[i][2] == 0) {
+          compute_newsite_thr(x[i],x[iH1],x[iH2],newsite[i]);
+          hneigh[i][2] = 1;
+        }
+      }
+      x1 = newsite[i];
+    } else x1 = x[i];
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      // LJ interaction based on true rsq
+
+      if (rsq < cut_ljsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        forcelj *= factor_lj * r2inv;
+
+        fxtmp += delx*forcelj;
+        fytmp += dely*forcelj;
+        fztmp += delz*forcelj;
+        f[j][0] -= delx*forcelj;
+        f[j][1] -= dely*forcelj;
+        f[j][2] -= delz*forcelj;
+
+        if (EFLAG) {
+          evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+            offset[itype][jtype];
+          evdwl *= factor_lj;
+        } else evdwl = 0.0;
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal, /* newton_pair = */ 1,
+                                 evdwl,0.0,forcelj,delx,dely,delz,thr);
+      }
+
+      // adjust rsq and delxyz for off-site O charge(s) if necessary
+      // but only if they are within reach
+      // NOTE: to make this part thread safe, we need to
+      // make sure that the hneigh[][] entries only get
+      // updated, when all data is in place. worst case,
+      // some calculation is repeated, but since the results
+      // will be the same, there is no race condition.
+      if (rsq < cut_coulsqplus) {
+        if (itype == typeO || jtype == typeO) {
+
+          // if atom J = water O, set x2 = offset charge site
+          // else x2 = x of atom J
+
+          if (jtype == typeO) {
+            if (hneigh[j][0] < 0) {
+              jH1 = atom->map(atom->tag[j] + 1);
+              jH2 = atom->map(atom->tag[j] + 2);
+              if (jH1 == -1 || jH2 == -1)
+                error->one(FLERR,"TIP4P hydrogen is missing");
+              if (atom->type[jH1] != typeH || atom->type[jH2] != typeH)
+                error->one(FLERR,"TIP4P hydrogen has incorrect atom type");
+              compute_newsite_thr(x[j],x[jH1],x[jH2],newsite[j]);
+              hneigh[j][2] = 1;
+              hneigh[j][1] = jH2;
+              hneigh[j][0] = jH1;
+            } else {
+              jH1 = hneigh[j][0];
+              jH2 = hneigh[j][1];
+              if (hneigh[j][2] == 0) {
+                compute_newsite_thr(x[j],x[jH1],x[jH2],newsite[j]);
+                hneigh[j][2] = 1;
+              }
+            }
+            x2 = newsite[j];
+          } else x2 = x[j];
+
+          delx = x1[0] - x2[0];
+          dely = x1[1] - x2[1];
+          delz = x1[2] - x2[2];
+          rsq = delx*delx + dely*dely + delz*delz;
+        }
+
+        // Coulombic interaction based on modified rsq
+
+        if (rsq < cut_coulsq) {
+          r2inv = 1 / rsq;
+          if (CTABLE || rsq <= tabinnersq) {
+            r = sqrt(rsq);
+            grij = g_ewald * r;
+            expm2 = exp(-grij*grij);
+            t = 1.0 / (1.0 + EWALD_P*grij);
+            erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            prefactor = qqrd2e * qtmp*q[j]/r;
+            forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+            if (factor_coul < 1.0) {
+              forcecoul -= (1.0-factor_coul)*prefactor;
+            }
+          } else {
+            union_int_float_t rsq_lookup;
+            rsq_lookup.f = rsq;
+            itable = rsq_lookup.i & ncoulmask;
+            itable >>= ncoulshiftbits;
+            fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+            table = ftable[itable] + fraction*dftable[itable];
+            forcecoul = qtmp*q[j] * table;
+            if (factor_coul < 1.0) {
+              table = ctable[itable] + fraction*dctable[itable];
+              prefactor = qtmp*q[j] * table;
+              forcecoul -= (1.0-factor_coul)*prefactor;
+            }
+          }
+
+          cforce = forcecoul * r2inv;
+
+          // if i,j are not O atoms, force is applied directly
+          // if i or j are O atoms, force is on fictitious atom & partitioned
+          // force partitioning due to Feenstra, J Comp Chem, 20, 786 (1999)
+          // f_f = fictitious force, fO = f_f (1 - 2 alpha), fH = alpha f_f
+          // preserves total force and torque on water molecule
+          // virial = sum(r x F) where each water's atoms are near xi and xj
+          // vlist stores 2,4,6 atoms whose forces contribute to virial
+
+          n = 0;
+
+          if (itype != typeO) {
+            fxtmp += delx * cforce;
+            fytmp += dely * cforce;
+            fztmp += delz * cforce;
+
+            if (VFLAG) {
+              v[0] = x[i][0] * delx * cforce;
+              v[1] = x[i][1] * dely * cforce;
+              v[2] = x[i][2] * delz * cforce;
+              v[3] = x[i][0] * dely * cforce;
+              v[4] = x[i][0] * delz * cforce;
+              v[5] = x[i][1] * delz * cforce;
+              vlist[n++] = i;
+            }
+
+          } else {
+
+            fdx = delx*cforce;
+            fdy = dely*cforce;
+            fdz = delz*cforce;
+
+            delxOM = x[i][0] - x1[0];
+            delyOM = x[i][1] - x1[1];
+            delzOM = x[i][2] - x1[2];
+
+            ddotf = (delxOM * fdx + delyOM * fdy + delzOM * fdz) /
+              (qdist*qdist);
+
+            f1x = alpha * (fdx - ddotf * delxOM);
+            f1y = alpha * (fdy - ddotf * delyOM);
+            f1z = alpha * (fdz - ddotf * delzOM);
+
+            fOx = fdx - f1x;
+            fOy = fdy - f1y;
+            fOz = fdz - f1z;
+
+            fHx = 0.5 * f1x;
+            fHy = 0.5 * f1y;
+            fHz = 0.5 * f1z;
+
+            fxtmp += fOx;
+            fytmp += fOy;
+            fztmp += fOz;
+
+            f[iH1][0] += fHx;
+            f[iH1][1] += fHy;
+            f[iH1][2] += fHz;
+
+            f[iH2][0] += fHx;
+            f[iH2][1] += fHy;
+            f[iH2][2] += fHz;
+
+            if (VFLAG) {
+              domain->closest_image(x[i],x[iH1],xH1);
+              domain->closest_image(x[i],x[iH2],xH2);
+
+              v[0] = x[i][0]*fOx + xH1[0]*fHx + xH2[0]*fHx;
+              v[1] = x[i][1]*fOy + xH1[1]*fHy + xH2[1]*fHy;
+              v[2] = x[i][2]*fOz + xH1[2]*fHz + xH2[2]*fHz;
+              v[3] = x[i][0]*fOy + xH1[0]*fHy + xH2[0]*fHy;
+              v[4] = x[i][0]*fOz + xH1[0]*fHz + xH2[0]*fHz;
+              v[5] = x[i][1]*fOz + xH1[1]*fHz + xH2[1]*fHz;
+
+              vlist[n++] = i;
+              vlist[n++] = iH1;
+              vlist[n++] = iH2;
+            }
+          }
+
+          if (jtype != typeO) {
+            f[j][0] -= delx * cforce;
+            f[j][1] -= dely * cforce;
+            f[j][2] -= delz * cforce;
+
+            if (VFLAG) {
+              v[0] -= x[j][0] * delx * cforce;
+              v[1] -= x[j][1] * dely * cforce;
+              v[2] -= x[j][2] * delz * cforce;
+              v[3] -= x[j][0] * dely * cforce;
+              v[4] -= x[j][0] * delz * cforce;
+              v[5] -= x[j][1] * delz * cforce;
+              vlist[n++] = j;
+            }
+
+          } else {
+
+            fdx = -delx*cforce;
+            fdy = -dely*cforce;
+            fdz = -delz*cforce;
+
+            delxOM = x[j][0] - x2[0];
+            delyOM = x[j][1] - x2[1];
+            delzOM = x[j][2] - x2[2];
+
+            ddotf = (delxOM * fdx + delyOM * fdy + delzOM * fdz) /
+              (qdist*qdist);
+
+            f1x = alpha * (fdx - ddotf * delxOM);
+            f1y = alpha * (fdy - ddotf * delyOM);
+            f1z = alpha * (fdz - ddotf * delzOM);
+
+            fOx = fdx - f1x;
+            fOy = fdy - f1y;
+            fOz = fdz - f1z;
+
+            fHx = 0.5 * f1x;
+            fHy = 0.5 * f1y;
+            fHz = 0.5 * f1z;
+
+            f[j][0] += fOx;
+            f[j][1] += fOy;
+            f[j][2] += fOz;
+
+            f[jH1][0] += fHx;
+            f[jH1][1] += fHy;
+            f[jH1][2] += fHz;
+
+            f[jH2][0] += fHx;
+            f[jH2][1] += fHy;
+            f[jH2][2] += fHz;
+
+            if (VFLAG) {
+              domain->closest_image(x[j],x[jH1],xH1);
+              domain->closest_image(x[j],x[jH2],xH2);
+
+              v[0] += x[j][0]*fOx + xH1[0]*fHx + xH2[0]*fHx;
+              v[1] += x[j][1]*fOy + xH1[1]*fHy + xH2[1]*fHy;
+              v[2] += x[j][2]*fOz + xH1[2]*fHz + xH2[2]*fHz;
+              v[3] += x[j][0]*fOy + xH1[0]*fHy + xH2[0]*fHy;
+              v[4] += x[j][0]*fOz + xH1[0]*fHz + xH2[0]*fHz;
+              v[5] += x[j][1]*fOz + xH1[1]*fHz + xH2[1]*fHz;
+
+              vlist[n++] = j;
+              vlist[n++] = jH1;
+              vlist[n++] = jH2;
+            }
+          }
+
+          if (EFLAG) {
+            if (CTABLE || rsq <= tabinnersq)
+              ecoul = prefactor*erfc;
+            else {
+              table = etable[itable] + fraction*detable[itable];
+              ecoul = qtmp*q[j] * table;
+            }
+            if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+          } else ecoul = 0.0;
+
+          if (EVFLAG) ev_tally_list_thr(this,n,vlist,ecoul,v,thr);
+        }
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------
+  compute position xM of fictitious charge site for O atom and 2 H atoms
+  return it as xM
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulLongTIP4POMP::compute_newsite_thr(const double * xO,
+                                                    const double * xH1,
+                                                    const double * xH2,
+                                                    double * xM) const
+{
+  double delx1 = xH1[0] - xO[0];
+  double dely1 = xH1[1] - xO[1];
+  double delz1 = xH1[2] - xO[2];
+  domain->minimum_image(delx1,dely1,delz1);
+
+  double delx2 = xH2[0] - xO[0];
+  double dely2 = xH2[1] - xO[1];
+  double delz2 = xH2[2] - xO[2];
+  domain->minimum_image(delx2,dely2,delz2);
+
+  const double prefac = alpha * 0.5;
+  xM[0] = xO[0] + prefac * (delx1 + delx2);
+  xM[1] = xO[1] + prefac * (dely1 + dely2);
+  xM[2] = xO[2] + prefac * (delz1 + delz2);
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCutCoulLongTIP4POMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCutCoulLongTIP4P::memory_usage();
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_cut_coul_long_tip4p_omp.h b/src/USER-OMP/pair_lj_cut_coul_long_tip4p_omp.h
new file mode 100644
index 0000000000..abb9c46ba8
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_coul_long_tip4p_omp.h
@@ -0,0 +1,51 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/long/tip4p/omp,PairLJCutCoulLongTIP4POMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_TIP4P_OMP_H
+#define LMP_PAIR_LJ_CUT_COUL_LONG_TIP4P_OMP_H
+
+#include "pair_lj_cut_coul_long_tip4p.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulLongTIP4POMP : public PairLJCutCoulLongTIP4P, public ThrOMP {
+
+ public:
+  PairLJCutCoulLongTIP4POMP(class LAMMPS *);
+  virtual ~PairLJCutCoulLongTIP4POMP() {};
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int CFLAG, int EVFLAG, int EFLAG, int VFLAG>
+  void eval(int ifrom, int ito, ThrData * const thr);
+  void compute_newsite_thr(const double *, const double *,
+                           const double *, double *) const;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_cut_coul_pppm_omp.cpp b/src/USER-OMP/pair_lj_cut_coul_pppm_omp.cpp
new file mode 100644
index 0000000000..093f8d0216
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_coul_pppm_omp.cpp
@@ -0,0 +1,246 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cut_coul_pppm_omp.h"
+#include "pppm_proxy.h"
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "update.h"
+
+#include <string.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulPPPMOMP::PairLJCutCoulPPPMOMP(LAMMPS *lmp) :
+  PairLJCutCoulLong(lmp), ThrOMP(lmp, THR_PAIR|THR_PROXY)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  nproxy=1;
+
+  kspace = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulPPPMOMP::init_style()
+{
+  if (comm->nthreads < 2)
+    error->all(FLERR,"need at least two threads per MPI task for this pair style");
+
+  if (strcmp(force->kspace_style,"pppm/proxy") != 0)
+    error->all(FLERR,"kspace style pppm/proxy is required with this pair style");
+
+  kspace = static_cast<PPPMProxy *>(force->kspace);
+
+  PairLJCutCoulLong::init_style();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulPPPMOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads, nproxy);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // thread id 0 runs pppm, the rest the pair style
+    if (tid < nproxy) {
+      kspace->compute_proxy(eflag,vflag);
+    } else {
+      if (evflag) {
+        if (eflag) {
+          if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+          else eval<1,1,0>(ifrom, ito, thr);
+        } else {
+          if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+          else eval<1,0,0>(ifrom, ito, thr);
+        }
+      } else {
+        if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+        else eval<0,0,0>(ifrom, ito, thr);
+      }
+    }
+
+    sync_threads();
+    reduce_thr(this, eflag, vflag, thr, nproxy);
+  } // end of omp parallel region
+}
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJCutCoulPPPMOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype,itable;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double fraction,table;
+  double r,rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double grij,expm2,prefactor,t,erfc;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsq) {
+          if (!ncoultablebits || rsq <= tabinnersq) {
+            r = sqrt(rsq);
+            grij = g_ewald * r;
+            expm2 = exp(-grij*grij);
+            t = 1.0 / (1.0 + EWALD_P*grij);
+            erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            prefactor = qqrd2e * qtmp*q[j]/r;
+            forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+            if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor;
+          } else {
+            union_int_float_t rsq_lookup;
+            rsq_lookup.f = rsq;
+            itable = rsq_lookup.i & ncoulmask;
+            itable >>= ncoulshiftbits;
+            fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+            table = ftable[itable] + fraction*dftable[itable];
+            forcecoul = qtmp*q[j] * table;
+            if (factor_coul < 1.0) {
+              table = ctable[itable] + fraction*dctable[itable];
+              prefactor = qtmp*q[j] * table;
+              forcecoul -= (1.0-factor_coul)*prefactor;
+            }
+          }
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          forcelj *= factor_lj;
+        } else forcelj = 0.0;
+
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq) {
+            if (!ncoultablebits || rsq <= tabinnersq)
+              ecoul = prefactor*erfc;
+            else {
+              table = etable[itable] + fraction*detable[itable];
+              ecoul = qtmp*q[j] * table;
+            }
+            if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+          } else ecoul = 0.0;
+
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+              offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCutCoulPPPMOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCutCoulLong::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_cut_coul_pppm_omp.h b/src/USER-OMP/pair_lj_cut_coul_pppm_omp.h
new file mode 100644
index 0000000000..092996a21f
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_coul_pppm_omp.h
@@ -0,0 +1,54 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/pppm/omp,PairLJCutCoulPPPMOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_PPPM_OMP_H
+#define LMP_PAIR_LJ_CUT_COUL_PPPM_OMP_H
+
+#include "pair_lj_cut_coul_long.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulPPPMOMP : public PairLJCutCoulLong, public ThrOMP {
+
+ public:
+  PairLJCutCoulPPPMOMP(class LAMMPS *);
+  virtual ~PairLJCutCoulPPPMOMP() {};
+
+  virtual void init_style();
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+
+  class PPPMProxy *kspace;
+  int nproxy;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_cut_coul_pppm_tip4p_omp.cpp b/src/USER-OMP/pair_lj_cut_coul_pppm_tip4p_omp.cpp
new file mode 100644
index 0000000000..ebdd8eeab7
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_coul_pppm_tip4p_omp.cpp
@@ -0,0 +1,542 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cut_coul_pppm_tip4p_omp.h"
+#include "pppm_tip4p_proxy.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "update.h"
+
+#include <string.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulPPPMTIP4POMP::PairLJCutCoulPPPMTIP4POMP(LAMMPS *lmp) :
+  PairLJCutCoulLongTIP4P(lmp), ThrOMP(lmp, THR_PAIR|THR_PROXY)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  nproxy=1;
+
+  kspace = NULL;
+
+  // TIP4P cannot compute virial as F dot r
+  // due to finding bonded H atoms which are not near O atom
+
+  no_virial_fdotr_compute = 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulPPPMTIP4POMP::init_style()
+{
+  if (comm->nthreads < 2)
+    error->all(FLERR,"need at least two threads per MPI task for this pair style");
+
+  if (strcmp(force->kspace_style,"pppm/tip4p/proxy") != 0)
+    error->all(FLERR,"kspace style pppm/tip4p/proxy is required with this pair style");
+
+  kspace = static_cast<PPPMTIP4PProxy *>(force->kspace);
+
+  PairLJCutCoulLongTIP4P::init_style();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulPPPMTIP4POMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  // reallocate hneigh & newsite if necessary
+  // initialize hneigh[0] to -1 on steps when reneighboring occurred
+  // initialize hneigh[2] to 0 every step
+
+  if (atom->nmax > nmax) {
+    nmax = atom->nmax;
+    memory->destroy(hneigh);
+    memory->create(hneigh,nmax,3,"pair:hneigh");
+    memory->destroy(newsite);
+    memory->create(newsite,nmax,3,"pair:newsite");
+  }
+
+  // XXX: this could be threaded, too.
+  int i;
+  if (neighbor->ago == 0)
+    for (i = 0; i < nall; i++) hneigh[i][0] = -1;
+  for (i = 0; i < nall; i++) hneigh[i][2] = 0;
+
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads, nproxy);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // thread id 0 runs pppm, the rest the pair style
+    if (tid < nproxy) {
+      kspace->compute_proxy(eflag,vflag);
+    } else {
+      if (!ncoultablebits) {
+        if (evflag) {
+          if (eflag) {
+            if (vflag) eval<1,1,1,1>(ifrom, ito, thr);
+            else eval<1,1,1,0>(ifrom, ito, thr);
+          } else {
+            if (vflag) eval<1,1,0,1>(ifrom, ito, thr);
+            else eval<1,1,0,0>(ifrom, ito, thr);
+          }
+        } else eval<1,0,0,0>(ifrom, ito, thr);
+      } else {
+        if (evflag) {
+          if (eflag) {
+            if (vflag) eval<0,1,1,1>(ifrom, ito, thr);
+            else eval<0,1,1,0>(ifrom, ito, thr);
+          } else {
+            if (vflag) eval<0,1,0,1>(ifrom, ito, thr);
+            else eval<0,1,0,0>(ifrom, ito, thr);
+          }
+        } else eval<0,0,0,0>(ifrom, ito, thr);
+      }
+    }
+
+    sync_threads();
+    reduce_thr(this, eflag, vflag, thr, nproxy);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int CTABLE, int EVFLAG, int EFLAG, int VFLAG>
+void PairLJCutCoulPPPMTIP4POMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype,itable;
+  int n,vlist[6];
+  int iH1,iH2,jH1,jH2;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul;
+  double fraction,table;
+  double delxOM,delyOM,delzOM;
+  double r,rsq,r2inv,r6inv,forcecoul,forcelj,cforce;
+  double factor_coul,factor_lj;
+  double grij,expm2,prefactor,t,erfc,ddotf;
+  double v[6],xH1[3],xH2[3];
+  double fdx,fdy,fdz,f1x,f1y,f1z,fOx,fOy,fOz,fHx,fHy,fHz;
+  const double *x1,*x2;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const int tid = thr->get_tid();
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  const double cut_coulsqplus = (cut_coul+2.0*qdist) * (cut_coul+2.0*qdist);
+
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+
+    // if atom I = water O, set x1 = offset charge site
+    // else x1 = x of atom I
+    // NOTE: to make this part thread safe, we need to
+    // make sure that the hneigh[][] entries only get
+    // updated, when all data is in place. worst case,
+    // some calculation is repeated, but since the results
+    // will be the same, there is no race condition.
+    if (itype == typeO) {
+      if (hneigh[i][0] < 0) {
+        iH1 = atom->map(atom->tag[i] + 1);
+        iH2 = atom->map(atom->tag[i] + 2);
+        if (iH1 == -1 || iH2 == -1)
+          error->one(FLERR,"TIP4P hydrogen is missing");
+        if (atom->type[iH1] != typeH || atom->type[iH2] != typeH)
+          error->one(FLERR,"TIP4P hydrogen has incorrect atom type");
+        compute_newsite_thr(x[i],x[iH1],x[iH2],newsite[i]);
+        hneigh[i][2] = 1;
+        hneigh[i][1] = iH2;
+        hneigh[i][0] = iH1;
+      } else {
+        iH1 = hneigh[i][0];
+        iH2 = hneigh[i][1];
+        if (hneigh[i][2] == 0) {
+          compute_newsite_thr(x[i],x[iH1],x[iH2],newsite[i]);
+          hneigh[i][2] = 1;
+        }
+      }
+      x1 = newsite[i];
+    } else x1 = x[i];
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      // LJ interaction based on true rsq
+
+      if (rsq < cut_ljsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        forcelj *= factor_lj * r2inv;
+
+        fxtmp += delx*forcelj;
+        fytmp += dely*forcelj;
+        fztmp += delz*forcelj;
+        f[j][0] -= delx*forcelj;
+        f[j][1] -= dely*forcelj;
+        f[j][2] -= delz*forcelj;
+
+        if (EFLAG) {
+          evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+            offset[itype][jtype];
+          evdwl *= factor_lj;
+        } else evdwl = 0.0;
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal, /* newton_pair = */ 1,
+                                 evdwl,0.0,forcelj,delx,dely,delz,thr);
+      }
+
+      // adjust rsq and delxyz for off-site O charge(s) if necessary
+      // but only if they are within reach
+      // NOTE: to make this part thread safe, we need to
+      // make sure that the hneigh[][] entries only get
+      // updated, when all data is in place. worst case,
+      // some calculation is repeated, but since the results
+      // will be the same, there is no race condition.
+      if (rsq < cut_coulsqplus) {
+        if (itype == typeO || jtype == typeO) {
+
+          // if atom J = water O, set x2 = offset charge site
+          // else x2 = x of atom J
+
+          if (jtype == typeO) {
+            if (hneigh[j][0] < 0) {
+              jH1 = atom->map(atom->tag[j] + 1);
+              jH2 = atom->map(atom->tag[j] + 2);
+              if (jH1 == -1 || jH2 == -1)
+                error->one(FLERR,"TIP4P hydrogen is missing");
+              if (atom->type[jH1] != typeH || atom->type[jH2] != typeH)
+                error->one(FLERR,"TIP4P hydrogen has incorrect atom type");
+              compute_newsite_thr(x[j],x[jH1],x[jH2],newsite[j]);
+              hneigh[j][2] = 1;
+              hneigh[j][1] = jH2;
+              hneigh[j][0] = jH1;
+            } else {
+              jH1 = hneigh[j][0];
+              jH2 = hneigh[j][1];
+              if (hneigh[j][2] == 0) {
+                compute_newsite_thr(x[j],x[jH1],x[jH2],newsite[j]);
+                hneigh[j][2] = 1;
+              }
+            }
+            x2 = newsite[j];
+          } else x2 = x[j];
+
+          delx = x1[0] - x2[0];
+          dely = x1[1] - x2[1];
+          delz = x1[2] - x2[2];
+          rsq = delx*delx + dely*dely + delz*delz;
+        }
+
+        // Coulombic interaction based on modified rsq
+
+        if (rsq < cut_coulsq) {
+          r2inv = 1 / rsq;
+          if (CTABLE || rsq <= tabinnersq) {
+            r = sqrt(rsq);
+            grij = g_ewald * r;
+            expm2 = exp(-grij*grij);
+            t = 1.0 / (1.0 + EWALD_P*grij);
+            erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            prefactor = qqrd2e * qtmp*q[j]/r;
+            forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+            if (factor_coul < 1.0) {
+              forcecoul -= (1.0-factor_coul)*prefactor;
+            }
+          } else {
+            union_int_float_t rsq_lookup;
+            rsq_lookup.f = rsq;
+            itable = rsq_lookup.i & ncoulmask;
+            itable >>= ncoulshiftbits;
+            fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+            table = ftable[itable] + fraction*dftable[itable];
+            forcecoul = qtmp*q[j] * table;
+            if (factor_coul < 1.0) {
+              table = ctable[itable] + fraction*dctable[itable];
+              prefactor = qtmp*q[j] * table;
+              forcecoul -= (1.0-factor_coul)*prefactor;
+            }
+          }
+
+          cforce = forcecoul * r2inv;
+
+          // if i,j are not O atoms, force is applied directly
+          // if i or j are O atoms, force is on fictitious atom & partitioned
+          // force partitioning due to Feenstra, J Comp Chem, 20, 786 (1999)
+          // f_f = fictitious force, fO = f_f (1 - 2 alpha), fH = alpha f_f
+          // preserves total force and torque on water molecule
+          // virial = sum(r x F) where each water's atoms are near xi and xj
+          // vlist stores 2,4,6 atoms whose forces contribute to virial
+
+          n = 0;
+
+          if (itype != typeO) {
+            fxtmp += delx * cforce;
+            fytmp += dely * cforce;
+            fztmp += delz * cforce;
+
+            if (VFLAG) {
+              v[0] = x[i][0] * delx * cforce;
+              v[1] = x[i][1] * dely * cforce;
+              v[2] = x[i][2] * delz * cforce;
+              v[3] = x[i][0] * dely * cforce;
+              v[4] = x[i][0] * delz * cforce;
+              v[5] = x[i][1] * delz * cforce;
+              vlist[n++] = i;
+            }
+
+          } else {
+
+            fdx = delx*cforce;
+            fdy = dely*cforce;
+            fdz = delz*cforce;
+
+            delxOM = x[i][0] - x1[0];
+            delyOM = x[i][1] - x1[1];
+            delzOM = x[i][2] - x1[2];
+
+            ddotf = (delxOM * fdx + delyOM * fdy + delzOM * fdz) /
+              (qdist*qdist);
+
+            f1x = alpha * (fdx - ddotf * delxOM);
+            f1y = alpha * (fdy - ddotf * delyOM);
+            f1z = alpha * (fdz - ddotf * delzOM);
+
+            fOx = fdx - f1x;
+            fOy = fdy - f1y;
+            fOz = fdz - f1z;
+
+            fHx = 0.5 * f1x;
+            fHy = 0.5 * f1y;
+            fHz = 0.5 * f1z;
+
+            fxtmp += fOx;
+            fytmp += fOy;
+            fztmp += fOz;
+
+            f[iH1][0] += fHx;
+            f[iH1][1] += fHy;
+            f[iH1][2] += fHz;
+
+            f[iH2][0] += fHx;
+            f[iH2][1] += fHy;
+            f[iH2][2] += fHz;
+
+            if (VFLAG) {
+              domain->closest_image(x[i],x[iH1],xH1);
+              domain->closest_image(x[i],x[iH2],xH2);
+
+              v[0] = x[i][0]*fOx + xH1[0]*fHx + xH2[0]*fHx;
+              v[1] = x[i][1]*fOy + xH1[1]*fHy + xH2[1]*fHy;
+              v[2] = x[i][2]*fOz + xH1[2]*fHz + xH2[2]*fHz;
+              v[3] = x[i][0]*fOy + xH1[0]*fHy + xH2[0]*fHy;
+              v[4] = x[i][0]*fOz + xH1[0]*fHz + xH2[0]*fHz;
+              v[5] = x[i][1]*fOz + xH1[1]*fHz + xH2[1]*fHz;
+
+              vlist[n++] = i;
+              vlist[n++] = iH1;
+              vlist[n++] = iH2;
+            }
+          }
+
+          if (jtype != typeO) {
+            f[j][0] -= delx * cforce;
+            f[j][1] -= dely * cforce;
+            f[j][2] -= delz * cforce;
+
+            if (VFLAG) {
+              v[0] -= x[j][0] * delx * cforce;
+              v[1] -= x[j][1] * dely * cforce;
+              v[2] -= x[j][2] * delz * cforce;
+              v[3] -= x[j][0] * dely * cforce;
+              v[4] -= x[j][0] * delz * cforce;
+              v[5] -= x[j][1] * delz * cforce;
+              vlist[n++] = j;
+            }
+
+          } else {
+
+            fdx = -delx*cforce;
+            fdy = -dely*cforce;
+            fdz = -delz*cforce;
+
+            delxOM = x[j][0] - x2[0];
+            delyOM = x[j][1] - x2[1];
+            delzOM = x[j][2] - x2[2];
+
+            ddotf = (delxOM * fdx + delyOM * fdy + delzOM * fdz) /
+              (qdist*qdist);
+
+            f1x = alpha * (fdx - ddotf * delxOM);
+            f1y = alpha * (fdy - ddotf * delyOM);
+            f1z = alpha * (fdz - ddotf * delzOM);
+
+            fOx = fdx - f1x;
+            fOy = fdy - f1y;
+            fOz = fdz - f1z;
+
+            fHx = 0.5 * f1x;
+            fHy = 0.5 * f1y;
+            fHz = 0.5 * f1z;
+
+            f[j][0] += fOx;
+            f[j][1] += fOy;
+            f[j][2] += fOz;
+
+            f[jH1][0] += fHx;
+            f[jH1][1] += fHy;
+            f[jH1][2] += fHz;
+
+            f[jH2][0] += fHx;
+            f[jH2][1] += fHy;
+            f[jH2][2] += fHz;
+
+            if (VFLAG) {
+              domain->closest_image(x[j],x[jH1],xH1);
+              domain->closest_image(x[j],x[jH2],xH2);
+
+              v[0] += x[j][0]*fOx + xH1[0]*fHx + xH2[0]*fHx;
+              v[1] += x[j][1]*fOy + xH1[1]*fHy + xH2[1]*fHy;
+              v[2] += x[j][2]*fOz + xH1[2]*fHz + xH2[2]*fHz;
+              v[3] += x[j][0]*fOy + xH1[0]*fHy + xH2[0]*fHy;
+              v[4] += x[j][0]*fOz + xH1[0]*fHz + xH2[0]*fHz;
+              v[5] += x[j][1]*fOz + xH1[1]*fHz + xH2[1]*fHz;
+
+              vlist[n++] = j;
+              vlist[n++] = jH1;
+              vlist[n++] = jH2;
+            }
+          }
+
+          if (EFLAG) {
+            if (CTABLE || rsq <= tabinnersq)
+              ecoul = prefactor*erfc;
+            else {
+              table = etable[itable] + fraction*detable[itable];
+              ecoul = qtmp*q[j] * table;
+            }
+            if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+          } else ecoul = 0.0;
+
+          if (EVFLAG) ev_tally_list_thr(this,n,vlist,ecoul,v,thr);
+        }
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------
+  compute position xM of fictitious charge site for O atom and 2 H atoms
+  return it as xM
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulPPPMTIP4POMP::compute_newsite_thr(const double * xO,
+                                                    const double * xH1,
+                                                    const double * xH2,
+                                                    double * xM) const
+{
+  double delx1 = xH1[0] - xO[0];
+  double dely1 = xH1[1] - xO[1];
+  double delz1 = xH1[2] - xO[2];
+  domain->minimum_image(delx1,dely1,delz1);
+
+  double delx2 = xH2[0] - xO[0];
+  double dely2 = xH2[1] - xO[1];
+  double delz2 = xH2[2] - xO[2];
+  domain->minimum_image(delx2,dely2,delz2);
+
+  const double prefac = alpha * 0.5;
+  xM[0] = xO[0] + prefac * (delx1 + delx2);
+  xM[1] = xO[1] + prefac * (dely1 + dely2);
+  xM[2] = xO[2] + prefac * (delz1 + delz2);
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCutCoulPPPMTIP4POMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCutCoulLongTIP4P::memory_usage();
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_cut_coul_pppm_tip4p_omp.h b/src/USER-OMP/pair_lj_cut_coul_pppm_tip4p_omp.h
new file mode 100644
index 0000000000..b149020925
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_coul_pppm_tip4p_omp.h
@@ -0,0 +1,56 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/pppm/tip4p/omp,PairLJCutCoulPPPMTIP4POMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_PPPM_TIP4P_OMP_H
+#define LMP_PAIR_LJ_CUT_COUL_PPPM_TIP4P_OMP_H
+
+#include "pair_lj_cut_coul_long_tip4p.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulPPPMTIP4POMP : public PairLJCutCoulLongTIP4P, public ThrOMP {
+
+ public:
+  PairLJCutCoulPPPMTIP4POMP(class LAMMPS *);
+  virtual ~PairLJCutCoulPPPMTIP4POMP() {};
+
+  virtual void init_style();
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int, int, int, int>
+  void eval(int ifrom, int ito, ThrData * const thr);
+  void compute_newsite_thr(const double *, const double *,
+                           const double *, double *) const;
+
+  class PPPMTIP4PProxy *kspace;
+  int nproxy;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_cut_omp.cpp b/src/USER-OMP/pair_lj_cut_omp.cpp
new file mode 100644
index 0000000000..83e5244add
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_omp.cpp
@@ -0,0 +1,159 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cut_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutOMP::PairLJCutOMP(LAMMPS *lmp) :
+  PairLJCut(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+  cut_respa = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJCutOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r6inv,forcelj,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        fpair = factor_lj*forcelj*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype])
+            - offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCutOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJCut::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_cut_omp.h b/src/USER-OMP/pair_lj_cut_omp.h
new file mode 100644
index 0000000000..419b0693b1
--- /dev/null
+++ b/src/USER-OMP/pair_lj_cut_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/omp,PairLJCutOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_OMP_H
+#define LMP_PAIR_LJ_CUT_OMP_H
+
+#include "pair_lj_cut.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutOMP : public PairLJCut, public ThrOMP {
+
+ public:
+  PairLJCutOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_expand_omp.cpp b/src/USER-OMP/pair_lj_expand_omp.cpp
new file mode 100644
index 0000000000..936c55c827
--- /dev/null
+++ b/src/USER-OMP/pair_lj_expand_omp.cpp
@@ -0,0 +1,162 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_expand_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJExpandOMP::PairLJExpandOMP(LAMMPS *lmp) :
+  PairLJExpand(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJExpandOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJExpandOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r6inv,forcelj,factor_lj;
+  double r,rshift,rshiftsq;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+        rshift = r - shift[itype][jtype];
+        rshiftsq = rshift*rshift;
+        r2inv = 1.0/rshiftsq;
+        r6inv = r2inv*r2inv*r2inv;
+        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        fpair = factor_lj*forcelj/rshift/r;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype])
+            - offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJExpandOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJExpand::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_expand_omp.h b/src/USER-OMP/pair_lj_expand_omp.h
new file mode 100644
index 0000000000..8ca3bd41b0
--- /dev/null
+++ b/src/USER-OMP/pair_lj_expand_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/expand/omp,PairLJExpandOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_EXPAND_OMP_H
+#define LMP_PAIR_LJ_EXPAND_OMP_H
+
+#include "pair_lj_expand.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJExpandOMP : public PairLJExpand, public ThrOMP {
+
+ public:
+  PairLJExpandOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_gromacs_coul_gromacs_omp.cpp b/src/USER-OMP/pair_lj_gromacs_coul_gromacs_omp.cpp
new file mode 100644
index 0000000000..4df4d334e7
--- /dev/null
+++ b/src/USER-OMP/pair_lj_gromacs_coul_gromacs_omp.cpp
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_gromacs_coul_gromacs_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJGromacsCoulGromacsOMP::PairLJGromacsCoulGromacsOMP(LAMMPS *lmp) :
+  PairLJGromacsCoulGromacs(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJGromacsCoulGromacsOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJGromacsCoulGromacsOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double r,tlj,tc,fswitch,fswitchcoul,eswitch,ecoulswitch;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+
+        // skip if qi or qj = 0.0 since this potential may be used as
+        // coarse-grain model with many uncharged atoms
+
+        if (rsq < cut_coulsq && qtmp != 0.0 && q[j] != 0.0) {
+          forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv);
+          if (rsq > cut_coul_innersq) {
+            r = sqrt(rsq);
+            tc = r - cut_coul_inner;
+            fswitchcoul = qqrd2e * qtmp*q[j]*r*tc*tc*(coulsw1 + coulsw2*tc);
+            forcecoul += fswitchcoul;
+          }
+          forcecoul *= factor_coul;
+        } else forcecoul = 0.0;
+
+        if (rsq < cut_ljsq) {
+          r6inv = r2inv*r2inv*r2inv;
+          jtype = type[j];
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          if (rsq > cut_lj_innersq) {
+            r = sqrt(rsq);
+            tlj = r - cut_lj_inner;
+            fswitch = r*tlj*tlj*(ljsw1[itype][jtype] +
+                                 ljsw2[itype][jtype]*tlj);
+            forcelj += fswitch;
+          }
+          forcelj *= factor_lj;
+        } else forcelj = 0.0;
+
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_coulsq) {
+            ecoul = qqrd2e * qtmp*q[j] * (sqrt(r2inv) - coulsw5);
+            if (rsq > cut_coul_innersq) {
+              ecoulswitch = tc*tc*tc * (coulsw3 + coulsw4*tc);
+              ecoul += qqrd2e*qtmp*q[j]*ecoulswitch;
+            }
+            ecoul *= factor_coul;
+          } else ecoul = 0.0;
+          if (rsq < cut_ljsq) {
+            evdwl = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]);
+            evdwl += ljsw5[itype][jtype];
+            if (rsq > cut_lj_innersq) {
+              eswitch = tlj*tlj*tlj *
+                (ljsw3[itype][jtype] + ljsw4[itype][jtype]*tlj);
+              evdwl += eswitch;
+            }
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJGromacsCoulGromacsOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJGromacsCoulGromacs::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_gromacs_coul_gromacs_omp.h b/src/USER-OMP/pair_lj_gromacs_coul_gromacs_omp.h
new file mode 100644
index 0000000000..353e4d673d
--- /dev/null
+++ b/src/USER-OMP/pair_lj_gromacs_coul_gromacs_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/gromacs/coul/gromacs/omp,PairLJGromacsCoulGromacsOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_GROMACS_COUL_GROMACS_OMP_H
+#define LMP_PAIR_LJ_GROMACS_COUL_GROMACS_OMP_H
+
+#include "pair_lj_gromacs_coul_gromacs.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJGromacsCoulGromacsOMP : public PairLJGromacsCoulGromacs, public ThrOMP {
+
+ public:
+  PairLJGromacsCoulGromacsOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_gromacs_omp.cpp b/src/USER-OMP/pair_lj_gromacs_omp.cpp
new file mode 100644
index 0000000000..ce97ad2d1d
--- /dev/null
+++ b/src/USER-OMP/pair_lj_gromacs_omp.cpp
@@ -0,0 +1,170 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_gromacs_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJGromacsOMP::PairLJGromacsOMP(LAMMPS *lmp) :
+  PairLJGromacs(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJGromacsOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJGromacsOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r6inv,forcelj,factor_lj;
+  double r,t,fswitch,eswitch;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        if (rsq > cut_inner_sq[itype][jtype]) {
+          r = sqrt(rsq);
+          t = r - cut_inner[itype][jtype];
+          fswitch = r*t*t*(ljsw1[itype][jtype] + ljsw2[itype][jtype]*t);
+          forcelj += fswitch;
+        }
+
+        fpair = factor_lj*forcelj*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]);
+          evdwl += ljsw5[itype][jtype];
+          if (rsq > cut_inner_sq[itype][jtype]) {
+            eswitch = t*t*t*(ljsw3[itype][jtype] + ljsw4[itype][jtype]*t);
+            evdwl += eswitch;
+          }
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJGromacsOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJGromacs::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_gromacs_omp.h b/src/USER-OMP/pair_lj_gromacs_omp.h
new file mode 100644
index 0000000000..3669d1b219
--- /dev/null
+++ b/src/USER-OMP/pair_lj_gromacs_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/gromacs/omp,PairLJGromacsOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_GROMACS_OMP_H
+#define LMP_PAIR_LJ_GROMACS_OMP_H
+
+#include "pair_lj_gromacs.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJGromacsOMP : public PairLJGromacs, public ThrOMP {
+
+ public:
+  PairLJGromacsOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_sdk_coul_long_omp.cpp b/src/USER-OMP/pair_lj_sdk_coul_long_omp.cpp
new file mode 100644
index 0000000000..aa4e432a78
--- /dev/null
+++ b/src/USER-OMP/pair_lj_sdk_coul_long_omp.cpp
@@ -0,0 +1,235 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_sdk_coul_long_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "lj_sdk_common.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace LJSDKParms;
+/* ---------------------------------------------------------------------- */
+
+PairLJSDKCoulLongOMP::PairLJSDKCoulLongOMP(LAMMPS *lmp) :
+  PairLJSDKCoulLong(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJSDKCoulLongOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval_thr<1,1,1>(ifrom, ito, thr);
+        else eval_thr<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval_thr<1,0,1>(ifrom, ito, thr);
+        else eval_thr<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval_thr<0,0,1>(ifrom, ito, thr);
+      else eval_thr<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJSDKCoulLongOMP::eval_thr(int iifrom, int iito, ThrData * const thr)
+{
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const q = atom->q;
+  const int * const type = atom->type;
+  const double * const special_coul = force->special_coul;
+  const double * const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+
+  const int * const ilist = list->ilist;
+  const int * const numneigh = list->numneigh;
+  const int * const * const firstneigh = list->firstneigh;
+  const int nlocal = atom->nlocal;
+
+  // loop over neighbors of my atoms
+
+  for (int ii = iifrom; ii < iito; ++ii) {
+
+    const int i = ilist[ii];
+    const int itype = type[i];
+    const double qtmp = q[i];
+    const double xtmp = x[i][0];
+    const double ytmp = x[i][1];
+    const double ztmp = x[i][2];
+    double fxtmp,fytmp,fztmp;
+    fxtmp=fytmp=fztmp=0.0;
+
+    const int * const jlist = firstneigh[i];
+    const int jnum = numneigh[i];
+
+    for (int jj = 0; jj < jnum; jj++) {
+      double forcecoul, forcelj, evdwl, ecoul;
+      forcecoul = forcelj = evdwl = ecoul = 0.0;
+
+      const int sbindex = sbmask(jlist[jj]);
+      const int j = jlist[jj] & NEIGHMASK;
+
+      const double delx = xtmp - x[j][0];
+      const double dely = ytmp - x[j][1];
+      const double delz = ztmp - x[j][2];
+      const double rsq = delx*delx + dely*dely + delz*delz;
+      const int jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        const double r2inv = 1.0/rsq;
+        const int ljt = lj_type[itype][jtype];
+
+        if (rsq < cut_coulsq) {
+          if (!ncoultablebits || rsq <= tabinnersq) {
+            const double A1 =  0.254829592;
+            const double A2 = -0.284496736;
+            const double A3 =  1.421413741;
+            const double A4 = -1.453152027;
+            const double A5 =  1.061405429;
+            const double EWALD_F = 1.12837917;
+            const double INV_EWALD_P = 1.0/0.3275911;
+
+            const double r = sqrt(rsq);
+            const double grij = g_ewald * r;
+            const double expm2 = exp(-grij*grij);
+            const double t = INV_EWALD_P / (INV_EWALD_P + grij);
+            const double erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            const double prefactor = qqrd2e * qtmp*q[j]/r;
+            forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+            if (EFLAG) ecoul = prefactor*erfc;
+            if (sbindex) {
+              const double adjust = (1.0-special_coul[sbindex])*prefactor;
+              forcecoul -= adjust;
+              if (EFLAG) ecoul -= adjust;
+            }
+          } else {
+            union_int_float_t rsq_lookup;
+            rsq_lookup.f = rsq;
+            const int itable = (rsq_lookup.i & ncoulmask) >> ncoulshiftbits;
+            const double fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+            const double table = ftable[itable] + fraction*dftable[itable];
+            forcecoul = qtmp*q[j] * table;
+            if (EFLAG) ecoul = qtmp*q[j] * (etable[itable] + fraction*detable[itable]);
+            if (sbindex) {
+              const double table2 = ctable[itable] + fraction*dctable[itable];
+              const double prefactor = qtmp*q[j] * table2;
+              const double adjust = (1.0-special_coul[sbindex])*prefactor;
+              forcecoul -= adjust;
+              if (EFLAG) ecoul -= adjust;
+            }
+          }
+        }
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+
+          if (ljt == LJ12_4) {
+            const double r4inv=r2inv*r2inv;
+            forcelj = r4inv*(lj1[itype][jtype]*r4inv*r4inv
+                             - lj2[itype][jtype]);
+
+            if (EFLAG)
+              evdwl = r4inv*(lj3[itype][jtype]*r4inv*r4inv
+                             - lj4[itype][jtype]) - offset[itype][jtype];
+
+          } else if (ljt == LJ9_6) {
+            const double r3inv = r2inv*sqrt(r2inv);
+            const double r6inv = r3inv*r3inv;
+            forcelj = r6inv*(lj1[itype][jtype]*r3inv
+                             - lj2[itype][jtype]);
+            if (EFLAG)
+              evdwl = r6inv*(lj3[itype][jtype]*r3inv
+                             - lj4[itype][jtype]) - offset[itype][jtype];
+
+          } else if (ljt == LJ12_6) {
+            const double r6inv = r2inv*r2inv*r2inv;
+            forcelj = r6inv*(lj1[itype][jtype]*r6inv
+                             - lj2[itype][jtype]);
+            if (EFLAG)
+              evdwl = r6inv*(lj3[itype][jtype]*r6inv
+                             - lj4[itype][jtype]) - offset[itype][jtype];
+          }
+
+          if (sbindex) {
+            const double factor_lj = special_lj[sbindex];
+            forcelj *= factor_lj;
+            if (EFLAG) evdwl *= factor_lj;
+          }
+
+        }
+
+        const double fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJSDKCoulLongOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJSDKCoulLong::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_sdk_coul_long_omp.h b/src/USER-OMP/pair_lj_sdk_coul_long_omp.h
new file mode 100644
index 0000000000..a615efb507
--- /dev/null
+++ b/src/USER-OMP/pair_lj_sdk_coul_long_omp.h
@@ -0,0 +1,49 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/sdk/coul/long/omp,PairLJSDKCoulLongOMP)
+PairStyle(cg/cmm/coul/long/omp,PairLJSDKCoulLongOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_SDK_COUL_LONG_OMP_H
+#define LMP_PAIR_LJ_SDK_COUL_LONG_OMP_H
+
+#include "pair_lj_sdk_coul_long.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJSDKCoulLongOMP : public PairLJSDKCoulLong, public ThrOMP {
+
+ public:
+  PairLJSDKCoulLongOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval_thr(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_sdk_omp.cpp b/src/USER-OMP/pair_lj_sdk_omp.cpp
new file mode 100644
index 0000000000..63b1a8c171
--- /dev/null
+++ b/src/USER-OMP/pair_lj_sdk_omp.cpp
@@ -0,0 +1,186 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+   This style is a simplified re-implementation of the CG/CMM pair style
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_sdk_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "lj_sdk_common.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace LJSDKParms;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJSDKOMP::PairLJSDKOMP(LAMMPS *lmp) :
+  PairLJSDK(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJSDKOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval_thr<1,1,1>(ifrom, ito, thr);
+        else eval_thr<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval_thr<1,0,1>(ifrom, ito, thr);
+        else eval_thr<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval_thr<0,0,1>(ifrom, ito, thr);
+      else eval_thr<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJSDKOMP::eval_thr(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,forcelj,factor_lj;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  const int * const ilist = list->ilist;
+  const int * const numneigh = list->numneigh;
+  const int * const * const firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    fxtmp=fytmp=fztmp=0.0;
+
+    const int itype = type[i];
+    const int * const jlist = firstneigh[i];
+    const int jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        const int ljt = lj_type[itype][jtype];
+
+        if (ljt == LJ12_4) {
+          const double r4inv=r2inv*r2inv;
+          forcelj = r4inv*(lj1[itype][jtype]*r4inv*r4inv
+                           - lj2[itype][jtype]);
+
+          if (EFLAG)
+            evdwl = r4inv*(lj3[itype][jtype]*r4inv*r4inv
+                           - lj4[itype][jtype]) - offset[itype][jtype];
+
+        } else if (ljt == LJ9_6) {
+          const double r3inv = r2inv*sqrt(r2inv);
+          const double r6inv = r3inv*r3inv;
+          forcelj = r6inv*(lj1[itype][jtype]*r3inv
+                           - lj2[itype][jtype]);
+          if (EFLAG)
+            evdwl = r6inv*(lj3[itype][jtype]*r3inv
+                           - lj4[itype][jtype]) - offset[itype][jtype];
+
+        } else if (ljt == LJ12_6) {
+          const double r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv*(lj1[itype][jtype]*r6inv
+                          - lj2[itype][jtype]);
+          if (EFLAG)
+            evdwl = r6inv*(lj3[itype][jtype]*r6inv
+                           - lj4[itype][jtype]) - offset[itype][jtype];
+        } else continue;
+
+        fpair = factor_lj*forcelj*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) evdwl *= factor_lj;
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJSDKOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJSDK::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_sdk_omp.h b/src/USER-OMP/pair_lj_sdk_omp.h
new file mode 100644
index 0000000000..c3837fb683
--- /dev/null
+++ b/src/USER-OMP/pair_lj_sdk_omp.h
@@ -0,0 +1,49 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/sdk/omp,PairLJSDKOMP)
+PairStyle(cg/cmm/omp,PairLJSDKOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_SDK_OMP_H
+#define LMP_PAIR_LJ_SDK_OMP_H
+
+#include "pair_lj_sdk.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJSDKOMP : public PairLJSDK, public ThrOMP {
+
+ public:
+  PairLJSDKOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval_thr(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_sf_omp.cpp b/src/USER-OMP/pair_lj_sf_omp.cpp
new file mode 100644
index 0000000000..5c14e29131
--- /dev/null
+++ b/src/USER-OMP/pair_lj_sf_omp.cpp
@@ -0,0 +1,161 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_sf_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJShiftedForceOMP::PairLJShiftedForceOMP(LAMMPS *lmp) :
+  PairLJShiftedForce(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJShiftedForceOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJShiftedForceOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double t,rsq,r2inv,r6inv,forcelj,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        t = sqrt(r2inv*cutsq[itype][jtype]);
+        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]) -
+          t*foffset[itype][jtype];
+        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        fpair = factor_lj*forcelj*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) +
+            (t-1.0)*foffset[itype][jtype] - offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJShiftedForceOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJShiftedForce::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_sf_omp.h b/src/USER-OMP/pair_lj_sf_omp.h
new file mode 100644
index 0000000000..92db973b3d
--- /dev/null
+++ b/src/USER-OMP/pair_lj_sf_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/sf/omp,PairLJShiftedForceOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_SF_OMP_H
+#define LMP_PAIR_LJ_SF_OMP_H
+
+#include "pair_lj_sf.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJShiftedForceOMP : public PairLJShiftedForce, public ThrOMP {
+
+ public:
+  PairLJShiftedForceOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_smooth_linear_omp.cpp b/src/USER-OMP/pair_lj_smooth_linear_omp.cpp
new file mode 100644
index 0000000000..a8f3ad8daa
--- /dev/null
+++ b/src/USER-OMP/pair_lj_smooth_linear_omp.cpp
@@ -0,0 +1,163 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_smooth_linear_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJSmoothLinearOMP::PairLJSmoothLinearOMP(LAMMPS *lmp) :
+  PairLJSmoothLinear(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJSmoothLinearOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJSmoothLinearOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r6inv,forcelj,factor_lj;
+  double r,rinv;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        rinv  = sqrt(r2inv);
+        forcelj = r6inv*(lj1[itype][jtype]*r6inv-lj2[itype][jtype]);
+        forcelj = rinv*forcelj - dljcut[itype][jtype];
+
+        fpair = factor_lj*forcelj*rinv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          r = sqrt(rsq);
+          evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]);
+          evdwl = evdwl - ljcut[itype][jtype]
+                          + (r-cut[itype][jtype])*dljcut[itype][jtype];
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJSmoothLinearOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJSmoothLinear::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_smooth_linear_omp.h b/src/USER-OMP/pair_lj_smooth_linear_omp.h
new file mode 100644
index 0000000000..940c0ea707
--- /dev/null
+++ b/src/USER-OMP/pair_lj_smooth_linear_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/smooth/linear/omp,PairLJSmoothLinearOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_SMOOTH_LINEAR_OMP_H
+#define LMP_PAIR_LJ_SMOOTH_LINEAR_OMP_H
+
+#include "pair_lj_smooth_linear.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJSmoothLinearOMP : public PairLJSmoothLinear, public ThrOMP {
+
+ public:
+  PairLJSmoothLinearOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lj_smooth_omp.cpp b/src/USER-OMP/pair_lj_smooth_omp.cpp
new file mode 100644
index 0000000000..b1918c3fcd
--- /dev/null
+++ b/src/USER-OMP/pair_lj_smooth_omp.cpp
@@ -0,0 +1,174 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_smooth_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairLJSmoothOMP::PairLJSmoothOMP(LAMMPS *lmp) :
+  PairLJSmooth(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJSmoothOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJSmoothOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r6inv,forcelj,factor_lj;
+  double r,t,tsq,fskin;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        if (rsq < cut_inner_sq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv-lj2[itype][jtype]);
+        } else {
+          r = sqrt(rsq);
+          t = r - cut_inner[itype][jtype];
+          tsq = t*t;
+          fskin = ljsw1[itype][jtype] + ljsw2[itype][jtype]*t +
+            ljsw3[itype][jtype]*tsq + ljsw4[itype][jtype]*tsq*t;
+          forcelj = fskin*r;
+        }
+
+        fpair = factor_lj*forcelj*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_inner_sq[itype][jtype])
+            evdwl = r6inv * (lj3[itype][jtype]*r6inv -
+                             lj4[itype][jtype]) - offset[itype][jtype];
+          else
+            evdwl = ljsw0[itype][jtype] - ljsw1[itype][jtype]*t -
+              ljsw2[itype][jtype]*tsq/2.0 - ljsw3[itype][jtype]*tsq*t/3.0 -
+              ljsw4[itype][jtype]*tsq*tsq/4.0 - offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJSmoothOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLJSmooth::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lj_smooth_omp.h b/src/USER-OMP/pair_lj_smooth_omp.h
new file mode 100644
index 0000000000..0b47a792a4
--- /dev/null
+++ b/src/USER-OMP/pair_lj_smooth_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/smooth/omp,PairLJSmoothOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_SMOOTH_OMP_H
+#define LMP_PAIR_LJ_SMOOTH_OMP_H
+
+#include "pair_lj_smooth.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJSmoothOMP : public PairLJSmooth, public ThrOMP {
+
+ public:
+  PairLJSmoothOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lubricate_omp.cpp b/src/USER-OMP/pair_lubricate_omp.cpp
new file mode 100644
index 0000000000..22090b2978
--- /dev/null
+++ b/src/USER-OMP/pair_lubricate_omp.cpp
@@ -0,0 +1,484 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lubricate_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "force.h"
+#include "input.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "update.h"
+#include "variable.h"
+#include "random_mars.h"
+#include "fix_wall.h"
+#include "fix_deform.h"
+#include "math_const.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+// same as fix_deform.cpp
+
+enum{NO_REMAP,X_REMAP,V_REMAP};
+
+// same as fix_wall.cpp
+
+enum{EDGE,CONSTANT,VARIABLE};
+
+/* ---------------------------------------------------------------------- */
+
+PairLubricateOMP::PairLubricateOMP(LAMMPS *lmp) :
+  PairLubricate(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairLubricateOMP::~PairLubricateOMP()
+{}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLubricateOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+
+  // This section of code adjusts R0/RT0/RS0 if necessary due to changes
+  // in the volume fraction as a result of fix deform or moving walls
+
+  double dims[3], wallcoord;
+  if (flagVF) // Flag for volume fraction corrections
+    if (flagdeform || flagwall == 2){ // Possible changes in volume fraction
+      if (flagdeform && !flagwall)
+        for (int j = 0; j < 3; j++)
+          dims[j] = domain->prd[j];
+      else if (flagwall == 2 || (flagdeform && flagwall == 1)){
+         double wallhi[3], walllo[3];
+         for (int j = 0; j < 3; j++){
+           wallhi[j] = domain->prd[j];
+           walllo[j] = 0;
+         }
+         for (int m = 0; m < wallfix->nwall; m++){
+           int dim = wallfix->wallwhich[m] / 2;
+           int side = wallfix->wallwhich[m] % 2;
+           if (wallfix->wallstyle[m] == VARIABLE){
+             wallcoord = input->variable->compute_equal(wallfix->varindex[m]);
+           }
+           else wallcoord = wallfix->coord0[m];
+           if (side == 0) walllo[dim] = wallcoord;
+           else wallhi[dim] = wallcoord;
+         }
+         for (int j = 0; j < 3; j++)
+           dims[j] = wallhi[j] - walllo[j];
+      }
+      double vol_T = dims[0]*dims[1]*dims[2];
+      double vol_f = vol_P/vol_T;
+      if (flaglog == 0) {
+        R0  = 6*MY_PI*mu*rad*(1.0 + 2.16*vol_f);
+        RT0 = 8*MY_PI*mu*pow(rad,3.0);
+        RS0 = 20.0/3.0*MY_PI*mu*pow(rad,3.0)*
+          (1.0 + 3.33*vol_f + 2.80*vol_f*vol_f);
+      } else {
+        R0  = 6*MY_PI*mu*rad*(1.0 + 2.725*vol_f - 6.583*vol_f*vol_f);
+        RT0 = 8*MY_PI*mu*pow(rad,3.0)*(1.0 + 0.749*vol_f - 2.469*vol_f*vol_f);
+        RS0 = 20.0/3.0*MY_PI*mu*pow(rad,3.0)*
+          (1.0 + 3.64*vol_f - 6.95*vol_f*vol_f);
+      }
+    }
+
+
+  // end of R0 adjustment code
+
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (flaglog) {
+      if (evflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (evflag) {
+        if (force->newton_pair) eval<0,1,1>(ifrom, ito, thr);
+        else eval<0,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+        else eval<0,0,0>(ifrom, ito, thr);
+      }
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int FLAGLOG, int EVFLAG, int NEWTON_PAIR>
+void PairLubricateOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fx,fy,fz,tx,ty,tz;
+  double rsq,r,h_sep,radi;
+  double vr1,vr2,vr3,vnnr,vn1,vn2,vn3;
+  double vt1,vt2,vt3,wt1,wt2,wt3,wdotn;
+  double vRS0;
+  double vi[3],vj[3],wi[3],wj[3],xl[3];
+  double a_sq,a_sh,a_pu;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  double lamda[3],vstream[3];
+
+  double vxmu2f = force->vxmu2f;
+
+  double * const * const x = atom->x;
+  double * const * const v = atom->v;
+  double * const * const f = thr->get_f();
+  double * const * const omega = atom->omega;
+  double * const * const torque = thr->get_torque();
+  const double * const radius = atom->radius;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  int overlaps = 0;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // subtract streaming component of velocity, omega, angmom
+  // assume fluid streaming velocity = box deformation rate
+  // vstream = (ux,uy,uz)
+  // ux = h_rate[0]*x + h_rate[5]*y + h_rate[4]*z
+  // uy = h_rate[1]*y + h_rate[3]*z
+  // uz = h_rate[2]*z
+  // omega_new = omega - curl(vstream)/2
+  // angmom_new = angmom - I*curl(vstream)/2
+  // Ef = (grad(vstream) + (grad(vstream))^T) / 2
+
+  if (shearing) {
+    double *h_rate = domain->h_rate;
+    double *h_ratelo = domain->h_ratelo;
+
+    for (ii = iifrom; ii < iito; ii++) {
+      i = ilist[ii];
+      itype = type[i];
+      radi = radius[i];
+
+      domain->x2lamda(x[i],lamda);
+      vstream[0] = h_rate[0]*lamda[0] + h_rate[5]*lamda[1] +
+        h_rate[4]*lamda[2] + h_ratelo[0];
+      vstream[1] = h_rate[1]*lamda[1] + h_rate[3]*lamda[2] + h_ratelo[1];
+      vstream[2] = h_rate[2]*lamda[2] + h_ratelo[2];
+      v[i][0] -= vstream[0];
+      v[i][1] -= vstream[1];
+      v[i][2] -= vstream[2];
+
+      omega[i][0] += 0.5*h_rate[3];
+      omega[i][1] -= 0.5*h_rate[4];
+      omega[i][2] += 0.5*h_rate[5];
+    }
+
+    // set Ef from h_rate in strain units
+
+    Ef[0][0] = h_rate[0]/domain->xprd;
+    Ef[1][1] = h_rate[1]/domain->yprd;
+    Ef[2][2] = h_rate[2]/domain->zprd;
+    Ef[0][1] = Ef[1][0] = 0.5 * h_rate[5]/domain->yprd;
+    Ef[0][2] = Ef[2][0] = 0.5 * h_rate[4]/domain->zprd;
+    Ef[1][2] = Ef[2][1] = 0.5 * h_rate[3]/domain->zprd;
+
+    // copy updated velocity/omega/angmom to the ghost particles
+    // no need to do this if not shearing since comm->ghost_velocity is set
+
+    sync_threads();
+
+    // MPI communication only on master thread
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+    { comm->forward_comm_pair(this); }
+
+    sync_threads();
+  }
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    radi = radius[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    // angular velocity
+
+    wi[0] = omega[i][0];
+    wi[1] = omega[i][1];
+    wi[2] = omega[i][2];
+
+    // FLD contribution to force and torque due to isotropic terms
+    // FLD contribution to stress from isotropic RS0
+
+    if (flagfld) {
+      f[i][0] -= vxmu2f*R0*v[i][0];
+      f[i][1] -= vxmu2f*R0*v[i][1];
+      f[i][2] -= vxmu2f*R0*v[i][2];
+      torque[i][0] -= vxmu2f*RT0*wi[0];
+      torque[i][1] -= vxmu2f*RT0*wi[1];
+      torque[i][2] -= vxmu2f*RT0*wi[2];
+
+      if (shearing && vflag_either) {
+        vRS0 = -vxmu2f * RS0;
+        v_tally_tensor(i,i,nlocal,NEWTON_PAIR,
+                       vRS0*Ef[0][0],vRS0*Ef[1][1],vRS0*Ef[2][2],
+                       vRS0*Ef[0][1],vRS0*Ef[0][2],vRS0*Ef[1][2]);
+      }
+    }
+
+    if (!flagHI) continue;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+
+        // angular momentum = I*omega = 2/5 * M*R^2 * omega
+
+        wj[0] = omega[j][0];
+        wj[1] = omega[j][1];
+        wj[2] = omega[j][2];
+
+        // xl = point of closest approach on particle i from its center
+
+        xl[0] = -delx/r*radi;
+        xl[1] = -dely/r*radi;
+        xl[2] = -delz/r*radi;
+
+        // velocity at the point of closest approach on both particles
+        // v = v + omega_cross_xl - Ef.xl
+
+        // particle i
+
+        vi[0] = v[i][0] + (wi[1]*xl[2] - wi[2]*xl[1])
+                        - (Ef[0][0]*xl[0] + Ef[0][1]*xl[1] + Ef[0][2]*xl[2]);
+
+        vi[1] = v[i][1] + (wi[2]*xl[0] - wi[0]*xl[2])
+                        - (Ef[1][0]*xl[0] + Ef[1][1]*xl[1] + Ef[1][2]*xl[2]);
+
+        vi[2] = v[i][2] + (wi[0]*xl[1] - wi[1]*xl[0])
+                        - (Ef[2][0]*xl[0] + Ef[2][1]*xl[1] + Ef[2][2]*xl[2]);
+
+        // particle j
+
+        vj[0] = v[j][0] - (wj[1]*xl[2] - wj[2]*xl[1])
+                        + (Ef[0][0]*xl[0] + Ef[0][1]*xl[1] + Ef[0][2]*xl[2]);
+
+        vj[1] = v[j][1] - (wj[2]*xl[0] - wj[0]*xl[2])
+                        + (Ef[1][0]*xl[0] + Ef[1][1]*xl[1] + Ef[1][2]*xl[2]);
+
+        vj[2] = v[j][2] - (wj[0]*xl[1] - wj[1]*xl[0])
+                        + (Ef[2][0]*xl[0] + Ef[2][1]*xl[1] + Ef[2][2]*xl[2]);
+
+        // scalar resistances XA and YA
+
+        h_sep = r - 2.0*radi;
+
+        // check for overlaps
+
+        if (h_sep < 0.0) overlaps++;
+
+        // if less than the minimum gap use the minimum gap instead
+
+        if (r < cut_inner[itype][jtype])
+          h_sep = cut_inner[itype][jtype] - 2.0*radi;
+
+        // scale h_sep by radi
+
+        h_sep = h_sep/radi;
+
+        // scalar resistances
+
+        if (FLAGLOG) {
+          a_sq = 6.0*MY_PI*mu*radi*(1.0/4.0/h_sep + 9.0/40.0*log(1.0/h_sep));
+          a_sh = 6.0*MY_PI*mu*radi*(1.0/6.0*log(1.0/h_sep));
+          a_pu = 8.0*MY_PI*mu*pow(radi,3.0)*(3.0/160.0*log(1.0/h_sep));
+        } else
+          a_sq = 6.0*MY_PI*mu*radi*(1.0/4.0/h_sep);
+
+        // relative velocity at the point of closest approach
+        // includes fluid velocity
+
+        vr1 = vi[0] - vj[0];
+        vr2 = vi[1] - vj[1];
+        vr3 = vi[2] - vj[2];
+
+        // normal component (vr.n)n
+
+        vnnr = (vr1*delx + vr2*dely + vr3*delz)/r;
+        vn1 = vnnr*delx/r;
+        vn2 = vnnr*dely/r;
+        vn3 = vnnr*delz/r;
+
+        // tangential component vr - (vr.n)n
+
+        vt1 = vr1 - vn1;
+        vt2 = vr2 - vn2;
+        vt3 = vr3 - vn3;
+
+        // force due to squeeze type motion
+
+        fx  = a_sq*vn1;
+        fy  = a_sq*vn2;
+        fz  = a_sq*vn3;
+
+        // force due to all shear kind of motions
+
+        if (FLAGLOG) {
+          fx = fx + a_sh*vt1;
+          fy = fy + a_sh*vt2;
+          fz = fz + a_sh*vt3;
+        }
+
+        // scale forces for appropriate units
+
+        fx *= vxmu2f;
+        fy *= vxmu2f;
+        fz *= vxmu2f;
+
+        // add to total force
+
+        f[i][0] -= fx;
+        f[i][1] -= fy;
+        f[i][2] -= fz;
+
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] += fx;
+          f[j][1] += fy;
+          f[j][2] += fz;
+        }
+
+        // torque due to this force
+
+        if (FLAGLOG) {
+          tx = xl[1]*fz - xl[2]*fy;
+          ty = xl[2]*fx - xl[0]*fz;
+          tz = xl[0]*fy - xl[1]*fx;
+
+          torque[i][0] -= vxmu2f*tx;
+          torque[i][1] -= vxmu2f*ty;
+          torque[i][2] -= vxmu2f*tz;
+
+          if (NEWTON_PAIR || j < nlocal) {
+            torque[j][0] -= vxmu2f*tx;
+            torque[j][1] -= vxmu2f*ty;
+            torque[j][2] -= vxmu2f*tz;
+          }
+
+          // torque due to a_pu
+
+          wdotn = ((wi[0]-wj[0])*delx + (wi[1]-wj[1])*dely +
+                   (wi[2]-wj[2])*delz)/r;
+          wt1 = (wi[0]-wj[0]) - wdotn*delx/r;
+          wt2 = (wi[1]-wj[1]) - wdotn*dely/r;
+          wt3 = (wi[2]-wj[2]) - wdotn*delz/r;
+
+          tx = a_pu*wt1;
+          ty = a_pu*wt2;
+          tz = a_pu*wt3;
+
+          torque[i][0] -= vxmu2f*tx;
+          torque[i][1] -= vxmu2f*ty;
+          torque[i][2] -= vxmu2f*tz;
+
+          if (NEWTON_PAIR || j < nlocal) {
+            torque[j][0] += vxmu2f*tx;
+            torque[j][1] += vxmu2f*ty;
+            torque[j][2] += vxmu2f*tz;
+          }
+        }
+
+        if (EVFLAG) ev_tally_xyz_thr(this,i,j,nlocal,NEWTON_PAIR,0.0,0.0,
+                                     -fx,-fy,-fz,delx,dely,delz,thr);
+      }
+    }
+  }
+
+  // restore streaming component of velocity, omega, angmom
+
+  if (shearing) {
+    double *h_rate = domain->h_rate;
+    double *h_ratelo = domain->h_ratelo;
+
+    for (ii = iifrom; ii < iito; ii++) {
+      i = ilist[ii];
+      itype = type[i];
+      radi = radius[i];
+
+      domain->x2lamda(x[i],lamda);
+      vstream[0] = h_rate[0]*lamda[0] + h_rate[5]*lamda[1] +
+        h_rate[4]*lamda[2] + h_ratelo[0];
+      vstream[1] = h_rate[1]*lamda[1] + h_rate[3]*lamda[2] + h_ratelo[1];
+      vstream[2] = h_rate[2]*lamda[2] + h_ratelo[2];
+      v[i][0] += vstream[0];
+      v[i][1] += vstream[1];
+      v[i][2] += vstream[2];
+
+      omega[i][0] -= 0.5*h_rate[3];
+      omega[i][1] += 0.5*h_rate[4];
+      omega[i][2] -= 0.5*h_rate[5];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLubricateOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLubricate::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lubricate_omp.h b/src/USER-OMP/pair_lubricate_omp.h
new file mode 100644
index 0000000000..b35dc5d563
--- /dev/null
+++ b/src/USER-OMP/pair_lubricate_omp.h
@@ -0,0 +1,49 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lubricate/omp,PairLubricateOMP)
+
+#else
+
+#ifndef LMP_PAIR_LUBRICATE_OMP_H
+#define LMP_PAIR_LUBRICATE_OMP_H
+
+#include "pair_lubricate.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLubricateOMP : public PairLubricate, public ThrOMP {
+
+ public:
+  PairLubricateOMP(class LAMMPS *);
+  virtual ~PairLubricateOMP();
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int LOGFLAG, int EVFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_lubricate_poly_omp.cpp b/src/USER-OMP/pair_lubricate_poly_omp.cpp
new file mode 100644
index 0000000000..26339b7749
--- /dev/null
+++ b/src/USER-OMP/pair_lubricate_poly_omp.cpp
@@ -0,0 +1,487 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lubricate_poly_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "force.h"
+#include "input.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "update.h"
+#include "variable.h"
+#include "random_mars.h"
+#include "fix_wall.h"
+#include "fix_deform.h"
+#include "math_const.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+// same as fix_deform.cpp
+
+enum{NO_REMAP,X_REMAP,V_REMAP};
+
+
+// same as fix_wall.cpp
+
+enum{EDGE,CONSTANT,VARIABLE};
+
+/* ---------------------------------------------------------------------- */
+
+PairLubricatePolyOMP::PairLubricatePolyOMP(LAMMPS *lmp) :
+  PairLubricatePoly(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairLubricatePolyOMP::~PairLubricatePolyOMP()
+{}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLubricatePolyOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+
+  // This section of code adjusts R0/RT0/RS0 if necessary due to changes
+  // in the volume fraction as a result of fix deform or moving walls
+
+  double dims[3], wallcoord;
+  if (flagVF) // Flag for volume fraction corrections
+    if (flagdeform || flagwall == 2){ // Possible changes in volume fraction
+      if (flagdeform && !flagwall)
+        for (int j = 0; j < 3; j++)
+          dims[j] = domain->prd[j];
+      else if (flagwall == 2 || (flagdeform && flagwall == 1)){
+         double wallhi[3], walllo[3];
+         for (int j = 0; j < 3; j++){
+           wallhi[j] = domain->prd[j];
+           walllo[j] = 0;
+         }
+         for (int m = 0; m < wallfix->nwall; m++){
+           int dim = wallfix->wallwhich[m] / 2;
+           int side = wallfix->wallwhich[m] % 2;
+           if (wallfix->wallstyle[m] == VARIABLE){
+             wallcoord = input->variable->compute_equal(wallfix->varindex[m]);
+           }
+           else wallcoord = wallfix->coord0[m];
+           if (side == 0) walllo[dim] = wallcoord;
+           else wallhi[dim] = wallcoord;
+         }
+         for (int j = 0; j < 3; j++)
+           dims[j] = wallhi[j] - walllo[j];
+      }
+      double vol_T = dims[0]*dims[1]*dims[2];
+      double vol_f = vol_P/vol_T;
+      if (flaglog == 0) {
+        R0  = 6*MY_PI*mu*(1.0 + 2.16*vol_f);
+        RT0 = 8*MY_PI*mu;
+        RS0 = 20.0/3.0*MY_PI*mu*(1.0 + 3.33*vol_f + 2.80*vol_f*vol_f);
+      } else {
+        R0  = 6*MY_PI*mu*(1.0 + 2.725*vol_f - 6.583*vol_f*vol_f);
+        RT0 = 8*MY_PI*mu*(1.0 + 0.749*vol_f - 2.469*vol_f*vol_f);
+        RS0 = 20.0/3.0*MY_PI*mu*(1.0 + 3.64*vol_f - 6.95*vol_f*vol_f);
+      }
+    }
+
+  // end of R0 adjustment code
+
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (flaglog) {
+      if (shearing) {
+        if (evflag)
+          eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (evflag)
+          eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (shearing) {
+        if (evflag)
+          eval<0,1,1>(ifrom, ito, thr);
+        else eval<0,1,0>(ifrom, ito, thr);
+      } else {
+        if (evflag)
+          eval<0,0,1>(ifrom, ito, thr);
+        else eval<0,0,0>(ifrom, ito, thr);
+      }
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int FLAGLOG, int SHEARING, int EVFLAG>
+void PairLubricatePolyOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fx,fy,fz,tx,ty,tz;
+  double rsq,r,h_sep,beta0,beta1,radi,radj;
+  double vr1,vr2,vr3,vnnr,vn1,vn2,vn3;
+  double vt1,vt2,vt3,wt1,wt2,wt3,wdotn;
+  double vRS0;
+  double vi[3],vj[3],wi[3],wj[3],xl[3],jl[3];
+  double a_sq,a_sh,a_pu;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  double lamda[3],vstream[3];
+
+  double vxmu2f = force->vxmu2f;
+
+  double * const * const x = atom->x;
+  double * const * const v = atom->v;
+  double * const * const f = thr->get_f();
+  double * const * const omega = atom->omega;
+  double * const * const torque = thr->get_torque();
+  const double * const radius = atom->radius;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  int overlaps = 0;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // subtract streaming component of velocity, omega, angmom
+  // assume fluid streaming velocity = box deformation rate
+  // vstream = (ux,uy,uz)
+  // ux = h_rate[0]*x + h_rate[5]*y + h_rate[4]*z
+  // uy = h_rate[1]*y + h_rate[3]*z
+  // uz = h_rate[2]*z
+  // omega_new = omega - curl(vstream)/2
+  // angmom_new = angmom - I*curl(vstream)/2
+  // Ef = (grad(vstream) + (grad(vstream))^T) / 2
+
+  if (shearing) {
+    double *h_rate = domain->h_rate;
+    double *h_ratelo = domain->h_ratelo;
+
+    for (ii = iifrom; ii < iito; ii++) {
+      i = ilist[ii];
+      itype = type[i];
+      radi = radius[i];
+      domain->x2lamda(x[i],lamda);
+      vstream[0] = h_rate[0]*lamda[0] + h_rate[5]*lamda[1] +
+        h_rate[4]*lamda[2] + h_ratelo[0];
+      vstream[1] = h_rate[1]*lamda[1] + h_rate[3]*lamda[2] + h_ratelo[1];
+      vstream[2] = h_rate[2]*lamda[2] + h_ratelo[2];
+      v[i][0] -= vstream[0];
+      v[i][1] -= vstream[1];
+      v[i][2] -= vstream[2];
+
+      omega[i][0] += 0.5*h_rate[3];
+      omega[i][1] -= 0.5*h_rate[4];
+      omega[i][2] += 0.5*h_rate[5];
+    }
+
+    // set Ef from h_rate in strain units
+
+    Ef[0][0] = h_rate[0]/domain->xprd;
+    Ef[1][1] = h_rate[1]/domain->yprd;
+    Ef[2][2] = h_rate[2]/domain->zprd;
+    Ef[0][1] = Ef[1][0] = 0.5 * h_rate[5]/domain->yprd;
+    Ef[0][2] = Ef[2][0] = 0.5 * h_rate[4]/domain->zprd;
+    Ef[1][2] = Ef[2][1] = 0.5 * h_rate[3]/domain->zprd;
+
+    // copy updated omega to the ghost particles
+    // no need to do this if not shearing since comm->ghost_velocity is set
+
+    sync_threads();
+
+    // MPI communication only on master thread
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+    { comm->forward_comm_pair(this); }
+
+    sync_threads();
+  }
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    radi = radius[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    // angular velocity
+
+    wi[0] = omega[i][0];
+    wi[1] = omega[i][1];
+    wi[2] = omega[i][2];
+
+    // FLD contribution to force and torque due to isotropic terms
+    // FLD contribution to stress from isotropic RS0
+
+    if (flagfld) {
+      f[i][0] -= vxmu2f*R0*radi*v[i][0];
+      f[i][1] -= vxmu2f*R0*radi*v[i][1];
+      f[i][2] -= vxmu2f*R0*radi*v[i][2];
+      const double rad3 = radi*radi*radi;
+      torque[i][0] -= vxmu2f*RT0*rad3*wi[0];
+      torque[i][1] -= vxmu2f*RT0*rad3*wi[1];
+      torque[i][2] -= vxmu2f*RT0*rad3*wi[2];
+
+      if (SHEARING && vflag_either) {
+        vRS0 = -vxmu2f * RS0*rad3;
+        v_tally_tensor(i,i,nlocal,/* newton_pair */ 0,
+                       vRS0*Ef[0][0],vRS0*Ef[1][1],vRS0*Ef[2][2],
+                       vRS0*Ef[0][1],vRS0*Ef[0][2],vRS0*Ef[1][2]);
+      }
+    }
+
+    if (!flagHI) continue;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+      radj = atom->radius[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+
+        // angular momentum = I*omega = 2/5 * M*R^2 * omega
+
+        wj[0] = omega[j][0];
+        wj[1] = omega[j][1];
+        wj[2] = omega[j][2];
+
+        // xl = point of closest approach on particle i from its center
+
+        xl[0] = -delx/r*radi;
+        xl[1] = -dely/r*radi;
+        xl[2] = -delz/r*radi;
+        jl[0] = -delx/r*radj;
+        jl[1] = -dely/r*radj;
+        jl[2] = -delz/r*radj;
+
+        // velocity at the point of closest approach on both particles
+        // v = v + omega_cross_xl - Ef.xl
+
+        // particle i
+
+        vi[0] = v[i][0] + (wi[1]*xl[2] - wi[2]*xl[1])
+                        - (Ef[0][0]*xl[0] + Ef[0][1]*xl[1] + Ef[0][2]*xl[2]);
+
+        vi[1] = v[i][1] + (wi[2]*xl[0] - wi[0]*xl[2])
+                        - (Ef[1][0]*xl[0] + Ef[1][1]*xl[1] + Ef[1][2]*xl[2]);
+
+        vi[2] = v[i][2] + (wi[0]*xl[1] - wi[1]*xl[0])
+                        - (Ef[2][0]*xl[0] + Ef[2][1]*xl[1] + Ef[2][2]*xl[2]);
+
+        // particle j
+
+        vj[0] = v[j][0] - (wj[1]*jl[2] - wj[2]*jl[1])
+                        + (Ef[0][0]*jl[0] + Ef[0][1]*jl[1] + Ef[0][2]*jl[2]);
+
+        vj[1] = v[j][1] - (wj[2]*jl[0] - wj[0]*jl[2])
+                        + (Ef[1][0]*jl[0] + Ef[1][1]*jl[1] + Ef[1][2]*jl[2]);
+
+        vj[2] = v[j][2] - (wj[0]*jl[1] - wj[1]*jl[0])
+                        + (Ef[2][0]*jl[0] + Ef[2][1]*jl[1] + Ef[2][2]*jl[2]);
+
+        // scalar resistances XA and YA
+
+        h_sep = r - radi-radj;
+
+        // check for overlaps
+
+        if (h_sep < 0.0) overlaps++;
+
+        // if less than the minimum gap use the minimum gap instead
+
+        if (r < cut_inner[itype][jtype])
+          h_sep = cut_inner[itype][jtype] - radi-radj;
+
+        // scale h_sep by radi
+
+        h_sep = h_sep/radi;
+        beta0 = radj/radi;
+        beta1 = 1.0 + beta0;
+
+        // scalar resistances
+
+        if (FLAGLOG) {
+          a_sq = beta0*beta0/beta1/beta1/h_sep +
+            (1.0+7.0*beta0+beta0*beta0)/5.0/pow(beta1,3.0)*log(1.0/h_sep);
+          a_sq += (1.0+18.0*beta0-29.0*beta0*beta0+18.0 *
+                   pow(beta0,3.0)+pow(beta0,4.0))/21.0/pow(beta1,4.0) *
+            h_sep*log(1.0/h_sep);
+          a_sq *= 6.0*MY_PI*mu*radi;
+          a_sh = 4.0*beta0*(2.0+beta0+2.0*beta0*beta0)/15.0/pow(beta1,3.0) *
+            log(1.0/h_sep);
+          a_sh += 4.0*(16.0-45.0*beta0+58.0*beta0*beta0-45.0*pow(beta0,3.0) +
+                       16.0*pow(beta0,4.0))/375.0/pow(beta1,4.0) *
+            h_sep*log(1.0/h_sep);
+          a_sh *= 6.0*MY_PI*mu*radi;
+          a_pu = beta0*(4.0+beta0)/10.0/beta1/beta1*log(1.0/h_sep);
+          a_pu += (32.0-33.0*beta0+83.0*beta0*beta0+43.0 *
+                   pow(beta0,3.0))/250.0/pow(beta1,3.0)*h_sep*log(1.0/h_sep);
+          a_pu *= 8.0*MY_PI*mu*pow(radi,3.0);
+        } else a_sq = 6.0*MY_PI*mu*radi*(beta0*beta0/beta1/beta1/h_sep);
+
+        // relative velocity at the point of closest approach
+        // includes fluid velocity
+
+        vr1 = vi[0] - vj[0];
+        vr2 = vi[1] - vj[1];
+        vr3 = vi[2] - vj[2];
+
+        // normal component (vr.n)n
+
+        vnnr = (vr1*delx + vr2*dely + vr3*delz)/r;
+        vn1 = vnnr*delx/r;
+        vn2 = vnnr*dely/r;
+        vn3 = vnnr*delz/r;
+
+        // tangential component vr - (vr.n)n
+
+        vt1 = vr1 - vn1;
+        vt2 = vr2 - vn2;
+        vt3 = vr3 - vn3;
+
+        // force due to squeeze type motion
+
+        fx  = a_sq*vn1;
+        fy  = a_sq*vn2;
+        fz  = a_sq*vn3;
+
+        // force due to all shear kind of motions
+
+        if (FLAGLOG) {
+          fx = fx + a_sh*vt1;
+          fy = fy + a_sh*vt2;
+          fz = fz + a_sh*vt3;
+        }
+
+        // scale forces for appropriate units
+
+        fx *= vxmu2f;
+        fy *= vxmu2f;
+        fz *= vxmu2f;
+
+        // add to total force
+
+        f[i][0] -= fx;
+        f[i][1] -= fy;
+        f[i][2] -= fz;
+
+        // torque due to this force
+
+        if (FLAGLOG) {
+          tx = xl[1]*fz - xl[2]*fy;
+          ty = xl[2]*fx - xl[0]*fz;
+          tz = xl[0]*fy - xl[1]*fx;
+
+          torque[i][0] -= vxmu2f*tx;
+          torque[i][1] -= vxmu2f*ty;
+          torque[i][2] -= vxmu2f*tz;
+
+          // torque due to a_pu
+
+          wdotn = ((wi[0]-wj[0])*delx + (wi[1]-wj[1])*dely +
+                   (wi[2]-wj[2])*delz)/r;
+          wt1 = (wi[0]-wj[0]) - wdotn*delx/r;
+          wt2 = (wi[1]-wj[1]) - wdotn*dely/r;
+          wt3 = (wi[2]-wj[2]) - wdotn*delz/r;
+
+          tx = a_pu*wt1;
+          ty = a_pu*wt2;
+          tz = a_pu*wt3;
+
+          torque[i][0] -= vxmu2f*tx;
+          torque[i][1] -= vxmu2f*ty;
+          torque[i][2] -= vxmu2f*tz;
+
+        }
+
+        if (EVFLAG) ev_tally_xyz(i,nlocal,nlocal, /* newton_pair */ 0,
+                                 0.0,0.0,-fx,-fy,-fz,delx,dely,delz);
+      }
+    }
+  }
+
+  // restore streaming component of velocity, omega, angmom
+
+  if (SHEARING) {
+    double *h_rate = domain->h_rate;
+    double *h_ratelo = domain->h_ratelo;
+
+    for (ii = iifrom; ii < iito; ii++) {
+      i = ilist[ii];
+      itype = type[i];
+      radi = radius[i];
+
+      domain->x2lamda(x[i],lamda);
+      vstream[0] = h_rate[0]*lamda[0] + h_rate[5]*lamda[1] +
+        h_rate[4]*lamda[2] + h_ratelo[0];
+      vstream[1] = h_rate[1]*lamda[1] + h_rate[3]*lamda[2] + h_ratelo[1];
+      vstream[2] = h_rate[2]*lamda[2] + h_ratelo[2];
+      v[i][0] += vstream[0];
+      v[i][1] += vstream[1];
+      v[i][2] += vstream[2];
+
+      omega[i][0] -= 0.5*h_rate[3];
+      omega[i][1] += 0.5*h_rate[4];
+      omega[i][2] -= 0.5*h_rate[5];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLubricatePolyOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairLubricatePoly::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_lubricate_poly_omp.h b/src/USER-OMP/pair_lubricate_poly_omp.h
new file mode 100644
index 0000000000..e9984ce0d4
--- /dev/null
+++ b/src/USER-OMP/pair_lubricate_poly_omp.h
@@ -0,0 +1,49 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lubricate/poly/omp,PairLubricateOMP)
+
+#else
+
+#ifndef LMP_PAIR_LUBRICATE_POLY_OMP_H
+#define LMP_PAIR_LUBRICATE_POLY_OMP_H
+
+#include "pair_lubricate_poly.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLubricatePolyOMP : public PairLubricatePoly, public ThrOMP {
+
+ public:
+  PairLubricatePolyOMP(class LAMMPS *);
+  virtual ~PairLubricatePolyOMP();
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int LOGFLAG, int EVFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_meam_spline_omp.cpp b/src/USER-OMP/pair_meam_spline_omp.cpp
new file mode 100644
index 0000000000..cf9a6c5bc9
--- /dev/null
+++ b/src/USER-OMP/pair_meam_spline_omp.cpp
@@ -0,0 +1,324 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "string.h"
+
+#include "pair_meam_spline_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairMEAMSplineOMP::PairMEAMSplineOMP(LAMMPS *lmp) :
+  PairMEAMSpline(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairMEAMSplineOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = eflag_global = vflag_global = eflag_atom = vflag_atom = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = listfull->inum;
+
+  if (listhalf->inum != inum)
+    error->warning(FLERR,"inconsistent half and full neighborlist");
+
+  // Grow per-atom array if necessary.
+
+  if (atom->nmax > nmax) {
+    memory->destroy(Uprime_values);
+    nmax = atom->nmax;
+    memory->create(Uprime_values,nmax*nthreads,"pair:Uprime");
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    thr->init_eam(nall,Uprime_values);
+
+    if (evflag) {
+      if (eflag) {
+        eval<1,1>(ifrom, ito, thr);
+      } else {
+        eval<1,0>(ifrom, ito, thr);
+      }
+    } else {
+      eval<0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG>
+void PairMEAMSplineOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  const int* const ilist_full = listfull->ilist;
+  const int* const numneigh_full = listfull->numneigh;
+  const int* const * const firstneigh_full = listfull->firstneigh;
+
+  // Determine the maximum number of neighbors a single atom has.
+  int myMaxNeighbors = 0;
+  for(int ii = iifrom; ii < iito; ii++) {
+    int jnum = numneigh_full[ilist_full[ii]];
+    if(jnum > myMaxNeighbors) myMaxNeighbors = jnum;
+  }
+
+  // Allocate array for temporary bond info.
+  MEAM2Body *myTwoBodyInfo = new MEAM2Body[myMaxNeighbors];
+
+  const double * const * const x = atom->x;
+  double * const * const forces = thr->get_f();
+  double * const Uprime_thr = thr->get_rho();
+  const int tid = thr->get_tid();
+  const int nthreads = comm->nthreads;
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  const double cutforcesq = cutoff*cutoff;
+
+  // Sum three-body contributions to charge density and compute embedding energies.
+  for(int ii = iifrom; ii < iito; ii++) {
+
+    const int i = ilist_full[ii];
+    const double xtmp = x[i][0];
+    const double ytmp = x[i][1];
+    const double ztmp = x[i][2];
+    const int* const jlist = firstneigh_full[i];
+    const int jnum = numneigh_full[i];
+    double rho_value = 0;
+    int numBonds = 0;
+    MEAM2Body* nextTwoBodyInfo = myTwoBodyInfo;
+
+    for(int jj = 0; jj < jnum; jj++) {
+      const int j = jlist[jj] & NEIGHMASK;
+
+      const double jdelx = x[j][0] - xtmp;
+      const double jdely = x[j][1] - ytmp;
+      const double jdelz = x[j][2] - ztmp;
+      const double rij_sq = jdelx*jdelx + jdely*jdely + jdelz*jdelz;
+
+      if (rij_sq < cutforcesq) {
+        const double rij = sqrt(rij_sq);
+        double partial_sum = 0;
+
+        nextTwoBodyInfo->tag = j;
+        nextTwoBodyInfo->r = rij;
+        nextTwoBodyInfo->f = f.eval(rij, nextTwoBodyInfo->fprime);
+        nextTwoBodyInfo->del[0] = jdelx / rij;
+        nextTwoBodyInfo->del[1] = jdely / rij;
+        nextTwoBodyInfo->del[2] = jdelz / rij;
+
+        for(int kk = 0; kk < numBonds; kk++) {
+          const MEAM2Body& bondk = myTwoBodyInfo[kk];
+          double cos_theta = (nextTwoBodyInfo->del[0]*bondk.del[0] + nextTwoBodyInfo->del[1]*bondk.del[1] + nextTwoBodyInfo->del[2]*bondk.del[2]);
+          partial_sum += bondk.f * g.eval(cos_theta);
+        }
+
+        rho_value += nextTwoBodyInfo->f * partial_sum;
+        rho_value += rho.eval(rij);
+
+        numBonds++;
+        nextTwoBodyInfo++;
+      }
+    }
+
+    // Compute embedding energy and its derivative.
+    double Uprime_i;
+    double embeddingEnergy = U.eval(rho_value, Uprime_i) - zero_atom_energy;
+    Uprime_thr[i] = Uprime_i;
+    if (EFLAG) {
+      if (eflag_global) eng_vdwl += embeddingEnergy;
+      if (eflag_atom) eatom[i] += embeddingEnergy;
+    }
+
+    double forces_i[3] = {0.0, 0.0, 0.0};
+
+    // Compute three-body contributions to force.
+    for(int jj = 0; jj < numBonds; jj++) {
+      const MEAM2Body bondj = myTwoBodyInfo[jj];
+      const double rij = bondj.r;
+      const int j = bondj.tag;
+
+      const double f_rij_prime = bondj.fprime;
+      const double f_rij = bondj.f;
+
+      double forces_j[3] = {0.0, 0.0, 0.0};
+
+      MEAM2Body const* bondk = myTwoBodyInfo;
+      for(int kk = 0; kk < jj; kk++, ++bondk) {
+        const double rik = bondk->r;
+
+        const double cos_theta = (bondj.del[0]*bondk->del[0] + bondj.del[1]*bondk->del[1] + bondj.del[2]*bondk->del[2]);
+        double g_prime;
+        double g_value = g.eval(cos_theta, g_prime);
+        const double f_rik_prime = bondk->fprime;
+        const double f_rik = bondk->f;
+
+        double fij = -Uprime_i * g_value * f_rik * f_rij_prime;
+        double fik = -Uprime_i * g_value * f_rij * f_rik_prime;
+
+        const double prefactor = Uprime_i * f_rij * f_rik * g_prime;
+        const double prefactor_ij = prefactor / rij;
+        const double prefactor_ik = prefactor / rik;
+        fij += prefactor_ij * cos_theta;
+        fik += prefactor_ik * cos_theta;
+
+        double fj[3], fk[3];
+
+        fj[0] = bondj.del[0] * fij - bondk->del[0] * prefactor_ij;
+        fj[1] = bondj.del[1] * fij - bondk->del[1] * prefactor_ij;
+        fj[2] = bondj.del[2] * fij - bondk->del[2] * prefactor_ij;
+        forces_j[0] += fj[0];
+        forces_j[1] += fj[1];
+        forces_j[2] += fj[2];
+
+        fk[0] = bondk->del[0] * fik - bondj.del[0] * prefactor_ik;
+        fk[1] = bondk->del[1] * fik - bondj.del[1] * prefactor_ik;
+        fk[2] = bondk->del[2] * fik - bondj.del[2] * prefactor_ik;
+        forces_i[0] -= fk[0];
+        forces_i[1] -= fk[1];
+        forces_i[2] -= fk[2];
+
+        const int k = bondk->tag;
+        forces[k][0] += fk[0];
+        forces[k][1] += fk[1];
+        forces[k][2] += fk[2];
+
+        if(EVFLAG) {
+          double delta_ij[3];
+          double delta_ik[3];
+          delta_ij[0] = bondj.del[0] * rij;
+          delta_ij[1] = bondj.del[1] * rij;
+          delta_ij[2] = bondj.del[2] * rij;
+          delta_ik[0] = bondk->del[0] * rik;
+          delta_ik[1] = bondk->del[1] * rik;
+          delta_ik[2] = bondk->del[2] * rik;
+          ev_tally3_thr(this,i,j,k,0.0,0.0,fj,fk,delta_ij,delta_ik,thr);
+        }
+      }
+
+      forces[i][0] -= forces_j[0];
+      forces[i][1] -= forces_j[1];
+      forces[i][2] -= forces_j[2];
+      forces[j][0] += forces_j[0];
+      forces[j][1] += forces_j[1];
+      forces[j][2] += forces_j[2];
+    }
+
+    forces[i][0] += forces_i[0];
+    forces[i][1] += forces_i[1];
+    forces[i][2] += forces_i[2];
+  }
+
+  delete[] myTwoBodyInfo;
+
+  sync_threads();
+
+  // reduce per thread density
+  data_reduce_thr(Uprime_values, nall, nthreads, 1, tid);
+
+  // wait until reduction is complete so that master thread
+  // can communicate U'(rho) values.
+  sync_threads();
+
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+  { comm->forward_comm_pair(this); }
+
+  // wait until master thread is done with communication
+  sync_threads();
+
+  const int* const ilist_half = listhalf->ilist;
+  const int* const numneigh_half = listhalf->numneigh;
+  const int* const * const firstneigh_half = listhalf->firstneigh;
+
+  // Compute two-body pair interactions.
+  for(int ii = iifrom; ii < iito; ii++) {
+    const int i = ilist_half[ii];
+    const double xtmp = x[i][0];
+    const double ytmp = x[i][1];
+    const double ztmp = x[i][2];
+    const int* const jlist = firstneigh_half[i];
+    const int jnum = numneigh_half[i];
+
+    for(int jj = 0; jj < jnum; jj++) {
+      const int j = jlist[jj] & NEIGHMASK;
+
+      double jdel[3];
+      jdel[0] = x[j][0] - xtmp;
+      jdel[1] = x[j][1] - ytmp;
+      jdel[2] = x[j][2] - ztmp;
+      double rij_sq = jdel[0]*jdel[0] + jdel[1]*jdel[1] + jdel[2]*jdel[2];
+
+      if(rij_sq < cutforcesq) {
+        double rij = sqrt(rij_sq);
+
+        double rho_prime;
+        rho.eval(rij, rho_prime);
+        double fpair = rho_prime * (Uprime_values[i] + Uprime_values[j]);
+
+        double pair_pot_deriv;
+        double pair_pot = phi.eval(rij, pair_pot_deriv);
+        fpair += pair_pot_deriv;
+
+        // Divide by r_ij to get forces from gradient.
+        fpair /= rij;
+
+        forces[i][0] += jdel[0]*fpair;
+        forces[i][1] += jdel[1]*fpair;
+        forces[i][2] += jdel[2]*fpair;
+        forces[j][0] -= jdel[0]*fpair;
+        forces[j][1] -= jdel[1]*fpair;
+        forces[j][2] -= jdel[2]*fpair;
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal, 1 /* newton_pair */,
+                                 pair_pot,0.0,-fpair,jdel[0],jdel[1],jdel[2],thr);
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairMEAMSplineOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairMEAMSpline::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_meam_spline_omp.h b/src/USER-OMP/pair_meam_spline_omp.h
new file mode 100644
index 0000000000..2fd169609f
--- /dev/null
+++ b/src/USER-OMP/pair_meam_spline_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(meam/spline/omp,PairMEAMSplineOMP)
+
+#else
+
+#ifndef LMP_PAIR_MEAM_SPLINE_OMP_H
+#define LMP_PAIR_MEAM_SPLINE_OMP_H
+
+#include "pair_meam_spline.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairMEAMSplineOMP : public PairMEAMSpline, public ThrOMP {
+
+ public:
+  PairMEAMSplineOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG>
+  void eval(int iifrom, int iito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_morse_omp.cpp b/src/USER-OMP/pair_morse_omp.cpp
new file mode 100644
index 0000000000..d1b0d56cfd
--- /dev/null
+++ b/src/USER-OMP/pair_morse_omp.cpp
@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_morse_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairMorseOMP::PairMorseOMP(LAMMPS *lmp) :
+  PairMorse(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairMorseOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairMorseOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r,dr,dexp,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+        dr = r - r0[itype][jtype];
+        dexp = exp(-alpha[itype][jtype] * dr);
+        fpair = factor_lj * morse1[itype][jtype] * (dexp*dexp - dexp) / r;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = d0[itype][jtype] * (dexp*dexp - 2.0*dexp) -
+            offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairMorseOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairMorse::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_morse_omp.h b/src/USER-OMP/pair_morse_omp.h
new file mode 100644
index 0000000000..40797e2502
--- /dev/null
+++ b/src/USER-OMP/pair_morse_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(morse/omp,PairMorseOMP)
+
+#else
+
+#ifndef LMP_PAIR_MORSE_OMP_H
+#define LMP_PAIR_MORSE_OMP_H
+
+#include "pair_morse.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairMorseOMP : public PairMorse, public ThrOMP {
+
+ public:
+  PairMorseOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_peri_lps_omp.cpp b/src/USER-OMP/pair_peri_lps_omp.cpp
new file mode 100644
index 0000000000..d61f71aa06
--- /dev/null
+++ b/src/USER-OMP/pair_peri_lps_omp.cpp
@@ -0,0 +1,453 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "float.h"
+#include "pair_peri_lps_omp.h"
+#include "fix.h"
+#include "fix_peri_neigh.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "force.h"
+#include "memory.h"
+#include "lattice.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "math_const.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+/* ---------------------------------------------------------------------- */
+
+PairPeriLPSOMP::PairPeriLPSOMP(LAMMPS *lmp) :
+  PairPeriLPS(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairPeriLPSOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = eflag_global = eflag_atom = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  // grow bond forces array if necessary
+
+  if (atom->nmax > nmax) {
+    memory->destroy(s0_new);
+    memory->destroy(theta);
+    nmax = atom->nmax;
+    memory->create(s0_new,nmax,"pair:s0_new");
+    memory->create(theta,nmax,"pair:theta");
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairPeriLPSOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz;
+  double xtmp0,ytmp0,ztmp0,delx0,dely0,delz0,rsq0;
+  double rsq,r,dr,rk,evdwl,fpair,fbond;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  double d_ij,delta,stretch;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  double fxtmp,fytmp,fztmp;
+
+  double *vfrac = atom->vfrac;
+  double *s0 = atom->s0;
+  double **x0 = atom->x0;
+  double **r0   = ((FixPeriNeigh *) modify->fix[ifix_peri])->r0;
+  int **partner = ((FixPeriNeigh *) modify->fix[ifix_peri])->partner;
+  int *npartner = ((FixPeriNeigh *) modify->fix[ifix_peri])->npartner;
+  double *wvolume = ((FixPeriNeigh *) modify->fix[ifix_peri])->wvolume;
+
+  // lc = lattice constant
+  // init_style guarantees it's the same in x, y, and z
+
+  double lc = domain->lattice->xlattice;
+  double half_lc = 0.5*lc;
+  double vfrac_scale = 1.0;
+
+  // short-range forces
+
+  int periodic = (domain->xperiodic || domain->yperiodic || domain->zperiodic);
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+  // need minimg() for x0 difference since not ghosted
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    xtmp0 = x0[i][0];
+    ytmp0 = x0[i][1];
+    ztmp0 = x0[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+
+      rsq = delx*delx + dely*dely + delz*delz;
+      delx0 = xtmp0 - x0[j][0];
+      dely0 = ytmp0 - x0[j][1];
+      delz0 = ztmp0 - x0[j][2];
+      if (periodic) domain->minimum_image(delx0,dely0,delz0);
+      rsq0 = delx0*delx0 + dely0*dely0 + delz0*delz0;
+      jtype = type[j];
+
+      r = sqrt(rsq);
+
+      // short-range interaction distance based on initial particle position
+      // 0.9 and 1.35 are constants
+
+      d_ij = MIN(0.9*sqrt(rsq0),1.35*lc);
+
+      // short-range contact forces
+      // 15 is constant taken from the EMU Theory Manual
+      // Silling, 12 May 2005, p 18
+
+      if (r < d_ij) {
+        dr = r - d_ij;
+
+        // kshort based upon short-range force constant
+        // of the bond-based theory used in PMB model
+
+        double kshort = (15.0 * 18.0 * bulkmodulus[itype][itype]) /
+          (MY_PI * cutsq[itype][jtype] * cutsq[itype][jtype]);
+        rk = (kshort * vfrac[j]) * (dr / cut[itype][jtype]);
+
+        if (r > 0.0) fpair = -(rk/r);
+        else fpair = 0.0;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) evdwl = 0.5*rk*dr;
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,evdwl,0.0,
+                                 fpair*vfrac[i],delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+
+  // wait until all threads are done since we
+  // need to distribute the work differently.
+  sync_threads();
+
+#if defined(_OPENMP)
+  // each thread works on a fixed chunk of atoms.
+  const int idelta = 1 + nlocal/comm->nthreads;
+  iifrom = thr->get_tid()*idelta;
+  iito   = ((iifrom + idelta) > nlocal) ? nlocal : (iifrom + idelta);
+#else
+  iifrom = 0;
+  iito = nlocal;
+#endif
+
+  // Compute the dilatation on each particle
+  if (iifrom < nlocal)
+    compute_dilatation_thr(iifrom, iito);
+
+  // wait until all threads are done before communication
+  sync_threads();
+
+#if defined(_OPENMP)
+#pragma omp master
+#endif
+  { // communicate dilatation (theta) of each particle
+    comm->forward_comm_pair(this);
+    // communicate weighted volume (wvolume) upon every reneighbor
+    if (neighbor->ago == 0)
+      comm->forward_comm_fix(modify->fix[ifix_peri]);
+  }
+
+  sync_threads();
+
+  // Volume-dependent part of the energy
+  if (EFLAG) {
+    for (i = iifrom; i < iito; i++) {
+      itype = type[i];
+      e_tally_thr(this, i, i, nlocal, NEWTON_PAIR,
+                  0.5 * bulkmodulus[itype][itype] * (theta[i] * theta[i]), 0.0, thr);
+    }
+  }
+
+  // loop over my particles and their partners
+  // partner list contains all bond partners, so I-J appears twice
+  // if bond already broken, skip this partner
+  // first = true if this is first neighbor of particle i
+
+  bool first;
+  double omega_minus, omega_plus;
+
+  for (i = iifrom; i < iito; ++i) {
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    xtmp0 = x0[i][0];
+    ytmp0 = x0[i][1];
+    ztmp0 = x0[i][2];
+    itype = type[i];
+    jnum = npartner[i];
+    first = true;
+
+    for (jj = 0; jj < jnum; jj++) {
+      if (partner[i][jj] == 0) continue;
+      j = atom->map(partner[i][jj]);
+
+      // check if lost a partner without first breaking bond
+
+      if (j < 0) {
+        partner[i][jj] = 0;
+        continue;
+      }
+
+      // compute force density, add to PD equation of motion
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      if (periodic) domain->minimum_image(delx,dely,delz);
+      rsq = delx*delx + dely*dely + delz*delz;
+      delx0 = xtmp0 - x0[j][0];
+      dely0 = ytmp0 - x0[j][1];
+      delz0 = ztmp0 - x0[j][2];
+      if (periodic) domain->minimum_image(delx0,dely0,delz0);
+      jtype = type[j];
+      delta = cut[itype][jtype];
+      r = sqrt(rsq);
+      dr = r - r0[i][jj];
+
+      // avoid roundoff errors
+
+      if (fabs(dr) < 2.2204e-016) dr = 0.0;
+
+      // scale vfrac[j] if particle j near the horizon
+
+      if ((fabs(r0[i][jj] - delta)) <= half_lc)
+        vfrac_scale = (-1.0/(2*half_lc))*(r0[i][jj]) +
+          (1.0 + ((delta - half_lc)/(2*half_lc) ) );
+      else vfrac_scale = 1.0;
+
+      omega_plus  = influence_function(-1.0*delx0,-1.0*dely0,-1.0*delz0);
+      omega_minus = influence_function(delx0,dely0,delz0);
+      rk = ( (3.0 * bulkmodulus[itype][itype]) -
+             (5.0 * shearmodulus[itype][itype]) ) * vfrac[j] * vfrac_scale *
+        ( (omega_plus * theta[i] / wvolume[i]) +
+          ( omega_minus * theta[j] / wvolume[j] ) ) * r0[i][jj];
+      rk +=  15.0 * ( shearmodulus[itype][itype] * vfrac[j] * vfrac_scale ) *
+        ( (omega_plus / wvolume[i]) + (omega_minus / wvolume[j]) ) * dr;
+
+      if (r > 0.0) fbond = -(rk/r);
+      else fbond = 0.0;
+
+      f[i][0] += delx*fbond;
+      f[i][1] += dely*fbond;
+      f[i][2] += delz*fbond;
+
+      // since I-J is double counted, set newton off & use 1/2 factor and I,I
+
+      double deviatoric_extension = dr - (theta[i]* r0[i][jj] / 3.0);
+      if (EFLAG) evdwl = 0.5 * 15 * (shearmodulus[itype][itype]/wvolume[i]) *
+                   omega_plus*(deviatoric_extension * deviatoric_extension) *
+                   vfrac[j] * vfrac_scale;
+      if (EVFLAG) ev_tally_thr(this,i,i,nlocal,0,0.5*evdwl,0.0,
+                               0.5*fbond*vfrac[i],delx,dely,delz,thr);
+
+      // find stretch in bond I-J and break if necessary
+      // use s0 from previous timestep
+
+      stretch = dr / r0[i][jj];
+      if (stretch > MIN(s0[i],s0[j])) partner[i][jj] = 0;
+
+      // update s0 for next timestep
+
+      if (first)
+         s0_new[i] = s00[itype][jtype] - (alpha[itype][jtype] * stretch);
+      else
+         s0_new[i] = MAX(s0_new[i],s00[itype][jtype] - (alpha[itype][jtype] * stretch));
+
+      first = false;
+    }
+  }
+
+  sync_threads();
+
+  // store new s0 (in parallel)
+  if (iifrom < nlocal)
+    for (i = iifrom; i < iito; i++) s0[i] = s0_new[i];
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairPeriLPSOMP::compute_dilatation_thr(int ifrom, int ito)
+{
+  int i,j,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz;
+  double xtmp0,ytmp0,ztmp0,delx0,dely0,delz0;
+  double rsq,r,dr;
+  double delta;
+
+  double **x = atom->x;
+  int *type = atom->type;
+  double **x0 = atom->x0;
+  double *vfrac = atom->vfrac;
+  double vfrac_scale = 1.0;
+
+  double lc = domain->lattice->xlattice;
+  double half_lc = 0.5*lc;
+
+  double **r0   = ((FixPeriNeigh *) modify->fix[ifix_peri])->r0;
+  int **partner = ((FixPeriNeigh *) modify->fix[ifix_peri])->partner;
+  int *npartner = ((FixPeriNeigh *) modify->fix[ifix_peri])->npartner;
+  double *wvolume = ((FixPeriNeigh *) modify->fix[ifix_peri])->wvolume;
+
+  int periodic = domain->xperiodic || domain->yperiodic || domain->zperiodic;
+
+  // compute the dilatation theta
+
+  for (i = ifrom; i < ito; i++) {
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    xtmp0 = x0[i][0];
+    ytmp0 = x0[i][1];
+    ztmp0 = x0[i][2];
+    jnum = npartner[i];
+    theta[i] = 0.0;
+    itype = type[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+
+      // if bond already broken, skip this partner
+      if (partner[i][jj] == 0) continue;
+
+      // Look up local index of this partner particle
+      j = atom->map(partner[i][jj]);
+
+      // Skip if particle is "lost"
+      if (j < 0) continue;
+
+      // Compute force density and add to PD equation of motion
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      if (periodic) domain->minimum_image(delx,dely,delz);
+      rsq = delx*delx + dely*dely + delz*delz;
+      delx0 = xtmp0 - x0[j][0];
+      dely0 = ytmp0 - x0[j][1];
+      delz0 = ztmp0 - x0[j][2];
+      if (periodic) domain->minimum_image(delx0,dely0,delz0);
+
+      r = sqrt(rsq);
+      dr = r - r0[i][jj];
+      if (fabs(dr) < 2.2204e-016) dr = 0.0;
+
+      jtype = type[j];
+      delta = cut[itype][jtype];
+
+      // scale vfrac[j] if particle j near the horizon
+
+      if ((fabs(r0[i][jj] - delta)) <= half_lc)
+        vfrac_scale = (-1.0/(2*half_lc))*(r0[i][jj]) +
+          (1.0 + ((delta - half_lc)/(2*half_lc) ) );
+      else vfrac_scale = 1.0;
+
+      theta[i] += influence_function(delx0, dely0, delz0) * r0[i][jj] * dr *
+        vfrac[j] * vfrac_scale;
+    }
+
+    // if wvolume[i] is zero, then particle i has no bonds
+    // therefore, the dilatation is set to
+
+    if (wvolume[i] != 0.0) theta[i] = (3.0/wvolume[i]) * theta[i];
+    else theta[i] = 0;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairPeriLPSOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairPeriLPS::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_peri_lps_omp.h b/src/USER-OMP/pair_peri_lps_omp.h
new file mode 100644
index 0000000000..aedb51472e
--- /dev/null
+++ b/src/USER-OMP/pair_peri_lps_omp.h
@@ -0,0 +1,52 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(peri/lps/omp,PairPeriLPSOMP)
+
+#else
+
+#ifndef LMP_PAIR_PERI_LPS_OMP_H
+#define LMP_PAIR_PERI_LPS_OMP_H
+
+#include "pair_peri_lps.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairPeriLPSOMP : public PairPeriLPS, public ThrOMP {
+
+ public:
+  PairPeriLPSOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ protected:
+  void compute_dilatation_thr(int ifrom, int ito);
+
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_peri_pmb_omp.cpp b/src/USER-OMP/pair_peri_pmb_omp.cpp
new file mode 100644
index 0000000000..404f1d5695
--- /dev/null
+++ b/src/USER-OMP/pair_peri_pmb_omp.cpp
@@ -0,0 +1,311 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "float.h"
+#include "pair_peri_pmb_omp.h"
+#include "fix.h"
+#include "fix_peri_neigh.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "force.h"
+#include "memory.h"
+#include "lattice.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairPeriPMBOMP::PairPeriPMBOMP(LAMMPS *lmp) :
+ PairPeriPMB(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairPeriPMBOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+  // grow bond forces array if necessary
+
+  if (atom->nmax > nmax) {
+    memory->destroy(s0_new);
+    nmax = atom->nmax;
+    memory->create(s0_new,nmax,"pair:s0_new");
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairPeriPMBOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz;
+  double xtmp0,ytmp0,ztmp0,delx0,dely0,delz0,rsq0;
+  double rsq,r,dr,rk,evdwl,fpair,fbond;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  double d_ij,delta,stretch;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  double fxtmp,fytmp,fztmp;
+
+  double *vfrac = atom->vfrac;
+  double *s0 = atom->s0;
+  double **x0 = atom->x0;
+  double **r0   = ((FixPeriNeigh *) modify->fix[ifix_peri])->r0;
+  int **partner = ((FixPeriNeigh *) modify->fix[ifix_peri])->partner;
+  int *npartner = ((FixPeriNeigh *) modify->fix[ifix_peri])->npartner;
+
+  // lc = lattice constant
+  // init_style guarantees it's the same in x, y, and z
+
+  double lc = domain->lattice->xlattice;
+  double half_lc = 0.5*lc;
+  double vfrac_scale = 1.0;
+
+  // short-range forces
+
+  int periodic = (domain->xperiodic || domain->yperiodic || domain->zperiodic);
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+  // need minimg() for x0 difference since not ghosted
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    xtmp0 = x0[i][0];
+    ytmp0 = x0[i][1];
+    ztmp0 = x0[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+
+      rsq = delx*delx + dely*dely + delz*delz;
+      delx0 = xtmp0 - x0[j][0];
+      dely0 = ytmp0 - x0[j][1];
+      delz0 = ztmp0 - x0[j][2];
+      if (periodic) domain->minimum_image(delx0,dely0,delz0);
+      rsq0 = delx0*delx0 + dely0*dely0 + delz0*delz0;
+      jtype = type[j];
+
+      r = sqrt(rsq);
+
+      // short-range interaction distance based on initial particle position
+      // 0.9 and 1.35 are constants
+
+      d_ij = MIN(0.9*sqrt(rsq0),1.35*lc);
+
+      // short-range contact forces
+      // 15 is constant taken from the EMU Theory Manual
+      // Silling, 12 May 2005, p 18
+
+      if (r < d_ij) {
+        dr = r - d_ij;
+
+        rk = (15.0 * kspring[itype][jtype] * vfrac[j]) *
+          (dr / cut[itype][jtype]);
+        if (r > 0.0) fpair = -(rk/r);
+        else fpair = 0.0;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) evdwl = 0.5*rk*dr;
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,evdwl,0.0,
+                                 fpair*vfrac[i],delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+
+  // wait until all threads are done since we
+  // need to distribute the work differently.
+  sync_threads();
+
+#if defined(_OPENMP)
+  // each thread works on a fixed chunk of atoms.
+  const int idelta = 1 + nlocal/comm->nthreads;
+  iifrom = thr->get_tid()*idelta;
+  iito   = ((iifrom + idelta) > nlocal) ? nlocal : (iifrom + idelta);
+#else
+  iifrom = 0;
+  iito = nlocal;
+#endif
+
+  // loop over my particles and their partners
+  // partner list contains all bond partners, so I-J appears twice
+  // if bond already broken, skip this partner
+  // first = true if this is first neighbor of particle i
+
+  bool first;
+
+  for (i = iifrom; i < iito; ++i) {
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jnum = npartner[i];
+    s0_new[i] = DBL_MAX;
+    first = true;
+
+    for (jj = 0; jj < jnum; jj++) {
+      if (partner[i][jj] == 0) continue;
+      j = atom->map(partner[i][jj]);
+
+      // check if lost a partner without first breaking bond
+
+      if (j < 0) {
+        partner[i][jj] = 0;
+        continue;
+      }
+
+      // compute force density, add to PD equation of motion
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      if (periodic) domain->minimum_image(delx,dely,delz);
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+      delta = cut[itype][jtype];
+      r = sqrt(rsq);
+      dr = r - r0[i][jj];
+
+      // avoid roundoff errors
+
+      if (fabs(dr) < 2.2204e-016) dr = 0.0;
+
+      // scale vfrac[j] if particle j near the horizon
+
+      if ((fabs(r0[i][jj] - delta)) <= half_lc)
+        vfrac_scale = (-1.0/(2*half_lc))*(r0[i][jj]) +
+          (1.0 + ((delta - half_lc)/(2*half_lc) ) );
+      else vfrac_scale = 1.0;
+
+      stretch = dr / r0[i][jj];
+      rk = (kspring[itype][jtype] * vfrac[j]) * vfrac_scale * stretch;
+      if (r > 0.0) fbond = -(rk/r);
+      else fbond = 0.0;
+
+      f[i][0] += delx*fbond;
+      f[i][1] += dely*fbond;
+      f[i][2] += delz*fbond;
+
+      // since I-J is double counted, set newton off & use 1/2 factor and I,I
+
+      if (EFLAG) evdwl = 0.5*rk*dr;
+      if (EVFLAG)
+        ev_tally_thr(this,i,i,nlocal,0,0.5*evdwl,0.0,
+                     0.5*fbond*vfrac[i],delx,dely,delz,thr);
+
+      // find stretch in bond I-J and break if necessary
+      // use s0 from previous timestep
+
+      if (stretch > MIN(s0[i],s0[j])) partner[i][jj] = 0;
+
+      // update s0 for next timestep
+
+      if (first)
+        s0_new[i] = s00[itype][jtype] - (alpha[itype][jtype] * stretch);
+      else
+        s0_new[i] = MAX(s0_new[i],s00[itype][jtype] - (alpha[itype][jtype] * stretch));
+
+      first = false;
+    }
+  }
+
+  sync_threads();
+
+  // store new s0 (in parallel)
+  if (iifrom < nlocal)
+    for (i = iifrom; i < iito; i++) s0[i] = s0_new[i];
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairPeriPMBOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairPeriPMB::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_peri_pmb_omp.h b/src/USER-OMP/pair_peri_pmb_omp.h
new file mode 100644
index 0000000000..cb2d83250e
--- /dev/null
+++ b/src/USER-OMP/pair_peri_pmb_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(peri/pmb/omp,PairPeriPMBOMP)
+
+#else
+
+#ifndef LMP_PAIR_PERI_PMB_OMP_H
+#define LMP_PAIR_PERI_PMB_OMP_H
+
+#include "pair_peri_pmb.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairPeriPMBOMP : public PairPeriPMB, public ThrOMP {
+
+ public:
+  PairPeriPMBOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_rebo_omp.cpp b/src/USER-OMP/pair_rebo_omp.cpp
new file mode 100644
index 0000000000..9a1ce8e11b
--- /dev/null
+++ b/src/USER-OMP/pair_rebo_omp.cpp
@@ -0,0 +1,33 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "pair_rebo_omp.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairREBOOMP::PairREBOOMP(LAMMPS *lmp) : PairAIREBOOMP(lmp) {}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairREBOOMP::settings(int narg, char **arg)
+{
+  if (narg != 0) error->all(FLERR,"Illegal pair_style command");
+
+  cutlj = 0.0;
+  ljflag = torflag = 0;
+}
diff --git a/src/USER-OMP/pair_rebo_omp.h b/src/USER-OMP/pair_rebo_omp.h
new file mode 100644
index 0000000000..685c5d26a0
--- /dev/null
+++ b/src/USER-OMP/pair_rebo_omp.h
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(rebo/omp,PairREBOOMP)
+
+#else
+
+#ifndef LMP_PAIR_REBO_OMP_H
+#define LMP_PAIR_REBO_OMP_H
+
+#include "pair_airebo_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairREBOOMP : public PairAIREBOOMP {
+ public:
+  PairREBOOMP(class LAMMPS *);
+  virtual void settings(int, char **);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_resquared_omp.cpp b/src/USER-OMP/pair_resquared_omp.cpp
new file mode 100644
index 0000000000..de05202300
--- /dev/null
+++ b/src/USER-OMP/pair_resquared_omp.cpp
@@ -0,0 +1,215 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_resquared_omp.h"
+#include "math_extra.h"
+#include "atom.h"
+#include "comm.h"
+#include "atom_vec_ellipsoid.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairRESquaredOMP::PairRESquaredOMP(LAMMPS *lmp) :
+  PairRESquared(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairRESquaredOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairRESquaredOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj;
+  double fforce[3],ttor[3],rtor[3],r12[3];
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  RE2Vars wi,wj;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  double * const * const tor = thr->get_torque();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+
+  double fxtmp,fytmp,fztmp,t1tmp,t2tmp,t3tmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    itype = type[i];
+    fxtmp=fytmp=fztmp=t1tmp=t2tmp=t3tmp=0.0;
+
+    // not a LJ sphere
+
+    if (lshape[itype] != 0.0) precompute_i(i,wi);
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      // r12 = center to center vector
+
+      r12[0] = x[j][0]-x[i][0];
+      r12[1] = x[j][1]-x[i][1];
+      r12[2] = x[j][2]-x[i][2];
+      rsq = MathExtra::dot3(r12,r12);
+      jtype = type[j];
+
+      // compute if less than cutoff
+
+      if (rsq < cutsq[itype][jtype]) {
+        fforce[0] = fforce[1] = fforce[2] = 0.0;
+
+        switch (form[itype][jtype]) {
+
+         case SPHERE_SPHERE:
+          r2inv = 1.0/rsq;
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          forcelj *= -r2inv;
+          if (EFLAG) one_eng =
+              r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) -
+              offset[itype][jtype];
+          fforce[0] = r12[0]*forcelj;
+          fforce[1] = r12[1]*forcelj;
+          fforce[2] = r12[2]*forcelj;
+          break;
+
+         case SPHERE_ELLIPSE:
+          precompute_i(j,wj);
+          if (NEWTON_PAIR || j < nlocal) {
+            one_eng = resquared_lj(j,i,wj,r12,rsq,fforce,rtor,true);
+            tor[j][0] += rtor[0]*factor_lj;
+            tor[j][1] += rtor[1]*factor_lj;
+            tor[j][2] += rtor[2]*factor_lj;
+          } else
+            one_eng = resquared_lj(j,i,wj,r12,rsq,fforce,rtor,false);
+          break;
+
+         case ELLIPSE_SPHERE:
+          one_eng = resquared_lj(i,j,wi,r12,rsq,fforce,ttor,true);
+          t1tmp += ttor[0]*factor_lj;
+          t2tmp += ttor[1]*factor_lj;
+          t3tmp += ttor[2]*factor_lj;
+          break;
+
+         default:
+          precompute_i(j,wj);
+          one_eng = resquared_analytic(i,j,wi,wj,r12,rsq,fforce,ttor,rtor);
+          t1tmp += ttor[0]*factor_lj;
+          t2tmp += ttor[1]*factor_lj;
+          t3tmp += ttor[2]*factor_lj;
+          if (NEWTON_PAIR || j < nlocal) {
+            tor[j][0] += rtor[0]*factor_lj;
+            tor[j][1] += rtor[1]*factor_lj;
+            tor[j][2] += rtor[2]*factor_lj;
+          }
+         break;
+        }
+
+        fforce[0] *= factor_lj;
+        fforce[1] *= factor_lj;
+        fforce[2] *= factor_lj;
+        fxtmp += fforce[0];
+        fytmp += fforce[1];
+        fztmp += fforce[2];
+
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= fforce[0];
+          f[j][1] -= fforce[1];
+          f[j][2] -= fforce[2];
+        }
+
+        if (EFLAG) evdwl = factor_lj*one_eng;
+
+        if (EVFLAG) ev_tally_xyz_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                     evdwl,0.0,fforce[0],fforce[1],fforce[2],
+                                     -r12[0],-r12[1],-r12[2],thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+    tor[i][0] += t1tmp;
+    tor[i][1] += t2tmp;
+    tor[i][2] += t3tmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairRESquaredOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairRESquared::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_resquared_omp.h b/src/USER-OMP/pair_resquared_omp.h
new file mode 100644
index 0000000000..cc2a59d74a
--- /dev/null
+++ b/src/USER-OMP/pair_resquared_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(resquared/omp,PairRESquaredOMP)
+
+#else
+
+#ifndef LMP_PAIR_RESQUARED_OMP_H
+#define LMP_PAIR_RESQUARED_OMP_H
+
+#include "pair_resquared.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairRESquaredOMP : public PairRESquared, public ThrOMP {
+
+ public:
+  PairRESquaredOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_soft_omp.cpp b/src/USER-OMP/pair_soft_omp.cpp
new file mode 100644
index 0000000000..51a43be28c
--- /dev/null
+++ b/src/USER-OMP/pair_soft_omp.cpp
@@ -0,0 +1,160 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_soft_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "math_const.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL 1.0e-4
+
+/* ---------------------------------------------------------------------- */
+
+PairSoftOMP::PairSoftOMP(LAMMPS *lmp) :
+  PairSoft(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSoftOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairSoftOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double r,rsq,arg,factor_lj;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+        arg = MY_PI/cut[itype][jtype];
+        if (r > SMALL) fpair = factor_lj * prefactor[itype][jtype] *
+                       sin(arg*r) * arg/r;
+        else fpair = 0.0;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG)
+          evdwl = factor_lj * prefactor[itype][jtype] * (1.0+cos(arg*r));
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairSoftOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairSoft::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_soft_omp.h b/src/USER-OMP/pair_soft_omp.h
new file mode 100644
index 0000000000..74958208ed
--- /dev/null
+++ b/src/USER-OMP/pair_soft_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(soft/omp,PairSoftOMP)
+
+#else
+
+#ifndef LMP_PAIR_SOFT_OMP_H
+#define LMP_PAIR_SOFT_OMP_H
+
+#include "pair_soft.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairSoftOMP : public PairSoft, public ThrOMP {
+
+ public:
+  PairSoftOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_sw_omp.cpp b/src/USER-OMP/pair_sw_omp.cpp
new file mode 100644
index 0000000000..6bac2420e1
--- /dev/null
+++ b/src/USER-OMP/pair_sw_omp.cpp
@@ -0,0 +1,210 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_sw_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairSWOMP::PairSWOMP(LAMMPS *lmp) :
+  PairSW(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSWOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        eval<1,1>(ifrom, ito, thr);
+      } else {
+        eval<1,0>(ifrom, ito, thr);
+      }
+    } else eval<0,0>(ifrom, ito, thr);
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG>
+void PairSWOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,k,ii,jj,kk,jnum,jnumm1,itag,jtag;
+  int itype,jtype,ktype,ijparam,ikparam,ijkparam;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,rsq1,rsq2;
+  double delr1[3],delr2[3],fj[3],fk[3];
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const tag = atom->tag;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  double fxtmp,fytmp,fztmp;
+
+  // loop over full neighbor list of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    itag = tag[i];
+    itype = map[type[i]];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    fxtmp = fytmp = fztmp = 0.0;
+
+    // two-body interactions, skip half of them
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtag = tag[j];
+
+      if (itag > jtag) {
+        if ((itag+jtag) % 2 == 0) continue;
+      } else if (itag < jtag) {
+        if ((itag+jtag) % 2 == 1) continue;
+      } else {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp && x[j][1] < ytmp) continue;
+        if (x[j][2] == ztmp && x[j][1] == ytmp && x[j][0] < xtmp) continue;
+      }
+
+      jtype = map[type[j]];
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      ijparam = elem2param[itype][jtype][jtype];
+      if (rsq > params[ijparam].cutsq) continue;
+
+      twobody(&params[ijparam],rsq,fpair,EFLAG,evdwl);
+
+      fxtmp += delx*fpair;
+      fytmp += dely*fpair;
+      fztmp += delz*fpair;
+      f[j][0] -= delx*fpair;
+      f[j][1] -= dely*fpair;
+      f[j][2] -= delz*fpair;
+
+      if (EVFLAG) ev_tally_thr(this,i,j,nlocal,/* newton_pair */ 1,
+                               evdwl,0.0,fpair,delx,dely,delz,thr);
+    }
+
+    jnumm1 = jnum - 1;
+
+    for (jj = 0; jj < jnumm1; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = map[type[j]];
+      ijparam = elem2param[itype][jtype][jtype];
+      delr1[0] = x[j][0] - xtmp;
+      delr1[1] = x[j][1] - ytmp;
+      delr1[2] = x[j][2] - ztmp;
+      rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
+      if (rsq1 > params[ijparam].cutsq) continue;
+
+      double fjxtmp,fjytmp,fjztmp;
+      fjxtmp = fjytmp = fjztmp = 0.0;
+
+      for (kk = jj+1; kk < jnum; kk++) {
+        k = jlist[kk];
+        k &= NEIGHMASK;
+        ktype = map[type[k]];
+        ikparam = elem2param[itype][ktype][ktype];
+        ijkparam = elem2param[itype][jtype][ktype];
+
+        delr2[0] = x[k][0] - xtmp;
+        delr2[1] = x[k][1] - ytmp;
+        delr2[2] = x[k][2] - ztmp;
+        rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
+        if (rsq2 > params[ikparam].cutsq) continue;
+
+        threebody(&params[ijparam],&params[ikparam],&params[ijkparam],
+                  rsq1,rsq2,delr1,delr2,fj,fk,EFLAG,evdwl);
+
+        fxtmp -= fj[0] + fk[0];
+        fytmp -= fj[1] + fk[1];
+        fztmp -= fj[2] + fk[2];
+        fjxtmp += fj[0];
+        fjytmp += fj[1];
+        fjztmp += fj[2];
+        f[k][0] += fk[0];
+        f[k][1] += fk[1];
+        f[k][2] += fk[2];
+
+        if (EVFLAG) ev_tally3_thr(this,i,j,k,evdwl,0.0,fj,fk,delr1,delr2,thr);
+      }
+      f[j][0] += fjxtmp;
+      f[j][1] += fjytmp;
+      f[j][2] += fjztmp;
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairSWOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairSW::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_sw_omp.h b/src/USER-OMP/pair_sw_omp.h
new file mode 100644
index 0000000000..a99edba46e
--- /dev/null
+++ b/src/USER-OMP/pair_sw_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(sw/omp,PairSWOMP)
+
+#else
+
+#ifndef LMP_PAIR_SW_OMP_H
+#define LMP_PAIR_SW_OMP_H
+
+#include "pair_sw.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairSWOMP : public PairSW, public ThrOMP {
+
+ public:
+  PairSWOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_table_omp.cpp b/src/USER-OMP/pair_table_omp.cpp
new file mode 100644
index 0000000000..ee2e447a56
--- /dev/null
+++ b/src/USER-OMP/pair_table_omp.cpp
@@ -0,0 +1,212 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_table_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairTableOMP::PairTableOMP(LAMMPS *lmp) :
+  PairTable(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairTableOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairTableOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype,itable;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,factor_lj,fraction,value,a,b;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  Table *tb;
+
+  union_int_float_t rsq_lookup;
+  int tlm1 = tablength - 1;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const int tid = thr->get_tid();
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        tb = &tables[tabindex[itype][jtype]];
+
+        if (check_error_thr((rsq < tb->innersq),tid,
+                            FLERR,"Pair distance < table inner cutoff"))
+          return;
+
+        if (tabstyle == LOOKUP) {
+          itable = static_cast<int> ((rsq - tb->innersq) * tb->invdelta);
+
+          if (check_error_thr((itable >= tlm1),tid,
+                              FLERR,"Pair distance > table outer cutoff"))
+            return;
+
+          fpair = factor_lj * tb->f[itable];
+        } else if (tabstyle == LINEAR) {
+          itable = static_cast<int> ((rsq - tb->innersq) * tb->invdelta);
+
+          if (check_error_thr((itable >= tlm1),tid,
+                              FLERR,"Pair distance > table outer cutoff"))
+            return;
+
+          fraction = (rsq - tb->rsq[itable]) * tb->invdelta;
+          value = tb->f[itable] + fraction*tb->df[itable];
+          fpair = factor_lj * value;
+        } else if (tabstyle == SPLINE) {
+          itable = static_cast<int> ((rsq - tb->innersq) * tb->invdelta);
+
+          if (check_error_thr((itable >= tlm1),tid,
+                              FLERR,"Pair distance > table outer cutoff"))
+            return;
+
+          b = (rsq - tb->rsq[itable]) * tb->invdelta;
+          a = 1.0 - b;
+          value = a * tb->f[itable] + b * tb->f[itable+1] +
+            ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) *
+            tb->deltasq6;
+          fpair = factor_lj * value;
+        } else {
+          rsq_lookup.f = rsq;
+          itable = rsq_lookup.i & tb->nmask;
+          itable >>= tb->nshiftbits;
+          fraction = (rsq_lookup.f - tb->rsq[itable]) * tb->drsq[itable];
+          value = tb->f[itable] + fraction*tb->df[itable];
+          fpair = factor_lj * value;
+        }
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (tabstyle == LOOKUP)
+            evdwl = tb->e[itable];
+          else if (tabstyle == LINEAR || tabstyle == BITMAP)
+            evdwl = tb->e[itable] + fraction*tb->de[itable];
+          else
+            evdwl = a * tb->e[itable] + b * tb->e[itable+1] +
+              ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) *
+              tb->deltasq6;
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairTableOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairTable::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_table_omp.h b/src/USER-OMP/pair_table_omp.h
new file mode 100644
index 0000000000..dbc0b8e5b9
--- /dev/null
+++ b/src/USER-OMP/pair_table_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(table/omp,PairTableOMP)
+
+#else
+
+#ifndef LMP_PAIR_TABLE_OMP_H
+#define LMP_PAIR_TABLE_OMP_H
+
+#include "pair_table.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairTableOMP : public PairTable, public ThrOMP {
+
+ public:
+  PairTableOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_tersoff_omp.cpp b/src/USER-OMP/pair_tersoff_omp.cpp
new file mode 100644
index 0000000000..573ed92ba3
--- /dev/null
+++ b/src/USER-OMP/pair_tersoff_omp.cpp
@@ -0,0 +1,250 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_tersoff_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairTersoffOMP::PairTersoffOMP(LAMMPS *lmp) :
+  PairTersoff(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairTersoffOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = vflag_atom = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (vflag_atom) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (vflag_atom) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else eval<0,0,0>(ifrom, ito, thr);
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int VFLAG_ATOM>
+void PairTersoffOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,k,ii,jj,kk,jnum;
+  int itag,jtag,itype,jtype,ktype,iparam_ij,iparam_ijk;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,rsq1,rsq2;
+  double delr1[3],delr2[3],fi[3],fj[3],fk[3];
+  double zeta_ij,prefactor;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const tag = atom->tag;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  double fxtmp,fytmp,fztmp;
+
+  // loop over full neighbor list of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    itag = tag[i];
+    itype = map[type[i]];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    fxtmp = fytmp = fztmp = 0.0;
+
+    // two-body interactions, skip half of them
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtag = tag[j];
+
+      if (itag > jtag) {
+        if ((itag+jtag) % 2 == 0) continue;
+      } else if (itag < jtag) {
+        if ((itag+jtag) % 2 == 1) continue;
+      } else {
+        if (x[j][2] < ztmp) continue;
+        if (x[j][2] == ztmp && x[j][1] < ytmp) continue;
+        if (x[j][2] == ztmp && x[j][1] == ytmp && x[j][0] < xtmp) continue;
+      }
+
+      jtype = map[type[j]];
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      iparam_ij = elem2param[itype][jtype][jtype];
+      if (rsq > params[iparam_ij].cutsq) continue;
+
+      repulsive(&params[iparam_ij],rsq,fpair,EFLAG,evdwl);
+
+      fxtmp += delx*fpair;
+      fytmp += dely*fpair;
+      fztmp += delz*fpair;
+      f[j][0] -= delx*fpair;
+      f[j][1] -= dely*fpair;
+      f[j][2] -= delz*fpair;
+
+      if (EVFLAG) ev_tally_thr(this,i,j,nlocal,/* newton_pair */ 1,
+                               evdwl,0.0,fpair,delx,dely,delz,thr);
+    }
+
+    // three-body interactions
+    // skip immediately if I-J is not within cutoff
+    double fjxtmp,fjytmp,fjztmp;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = map[type[j]];
+      iparam_ij = elem2param[itype][jtype][jtype];
+
+      delr1[0] = x[j][0] - xtmp;
+      delr1[1] = x[j][1] - ytmp;
+      delr1[2] = x[j][2] - ztmp;
+      rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
+      if (rsq1 > params[iparam_ij].cutsq) continue;
+
+      // accumulate bondorder zeta for each i-j interaction via loop over k
+
+      fjxtmp = fjytmp = fjztmp = 0.0;
+      zeta_ij = 0.0;
+
+      for (kk = 0; kk < jnum; kk++) {
+        if (jj == kk) continue;
+        k = jlist[kk];
+        k &= NEIGHMASK;
+        ktype = map[type[k]];
+        iparam_ijk = elem2param[itype][jtype][ktype];
+
+        delr2[0] = x[k][0] - xtmp;
+        delr2[1] = x[k][1] - ytmp;
+        delr2[2] = x[k][2] - ztmp;
+        rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
+        if (rsq2 > params[iparam_ijk].cutsq) continue;
+
+        zeta_ij += zeta(&params[iparam_ijk],rsq1,rsq2,delr1,delr2);
+      }
+
+      // pairwise force due to zeta
+
+      force_zeta(&params[iparam_ij],rsq1,zeta_ij,fpair,prefactor,EFLAG,evdwl);
+
+      fxtmp += delr1[0]*fpair;
+      fytmp += delr1[1]*fpair;
+      fztmp += delr1[2]*fpair;
+      fjxtmp -= delr1[0]*fpair;
+      fjytmp -= delr1[1]*fpair;
+      fjztmp -= delr1[2]*fpair;
+
+      if (EVFLAG) ev_tally_thr(this,i,j,nlocal,/* newton_pair */ 1,evdwl,0.0,
+                               -fpair,-delr1[0],-delr1[1],-delr1[2],thr);
+
+      // attractive term via loop over k
+
+      for (kk = 0; kk < jnum; kk++) {
+        if (jj == kk) continue;
+        k = jlist[kk];
+        k &= NEIGHMASK;
+        ktype = map[type[k]];
+        iparam_ijk = elem2param[itype][jtype][ktype];
+
+        delr2[0] = x[k][0] - xtmp;
+        delr2[1] = x[k][1] - ytmp;
+        delr2[2] = x[k][2] - ztmp;
+        rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
+        if (rsq2 > params[iparam_ijk].cutsq) continue;
+
+        attractive(&params[iparam_ijk],prefactor,
+                   rsq1,rsq2,delr1,delr2,fi,fj,fk);
+
+        fxtmp += fi[0];
+        fytmp += fi[1];
+        fztmp += fi[2];
+        fjxtmp += fj[0];
+        fjytmp += fj[1];
+        fjztmp += fj[2];
+        f[k][0] += fk[0];
+        f[k][1] += fk[1];
+        f[k][2] += fk[2];
+
+        if (VFLAG_ATOM) v_tally3_thr(i,j,k,fj,fk,delr1,delr2,thr);
+      }
+      f[j][0] += fjxtmp;
+      f[j][1] += fjytmp;
+      f[j][2] += fjztmp;
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairTersoffOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairTersoff::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_tersoff_omp.h b/src/USER-OMP/pair_tersoff_omp.h
new file mode 100644
index 0000000000..97c20548af
--- /dev/null
+++ b/src/USER-OMP/pair_tersoff_omp.h
@@ -0,0 +1,43 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(tersoff/omp,PairTersoffOMP)
+
+#else
+
+#ifndef LMP_PAIR_TERSOFF_OMP_H
+#define LMP_PAIR_TERSOFF_OMP_H
+
+#include "pair_tersoff.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairTersoffOMP : public PairTersoff, public ThrOMP {
+
+ public:
+  PairTersoffOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int VFLAG_ATOM>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_tersoff_table_omp.cpp b/src/USER-OMP/pair_tersoff_table_omp.cpp
new file mode 100644
index 0000000000..4560975b11
--- /dev/null
+++ b/src/USER-OMP/pair_tersoff_table_omp.cpp
@@ -0,0 +1,516 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_tersoff_table_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define GRIDSTART 0.1
+#define GRIDDENSITY_FCUTOFF 5000
+#define GRIDDENSITY_EXP 12000
+#define GRIDDENSITY_GTETA 12000
+#define GRIDDENSITY_BIJ 7500
+
+// max number of interaction per atom for environment potential
+
+#define leadingDimensionInteractionList 64
+
+/* ---------------------------------------------------------------------- */
+
+PairTersoffTableOMP::PairTersoffTableOMP(LAMMPS *lmp) :
+  PairTersoffTable(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairTersoffTableOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = vflag_atom = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag)
+      if (vflag_atom) eval<1,1>(ifrom, ito, thr);
+      else eval<1,0>(ifrom, ito, thr);
+    else eval<0,0>(ifrom, ito, thr);
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int VFLAG_ATOM>
+void PairTersoffTableOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,k,ii,inum,jnum;
+  int itype,jtype,ktype,ijparam,ikparam,ijkparam;
+  double xtmp,ytmp,ztmp;
+  double fxtmp,fytmp,fztmp;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  int interpolIDX;
+  double r_ik_x, r_ik_y, r_ik_z;
+  double directorCos_ij_x, directorCos_ij_y, directorCos_ij_z, directorCos_ik_x, directorCos_ik_y, directorCos_ik_z;
+  double invR_ij, invR_ik, cosTeta;
+  double repulsivePotential, attractivePotential;
+  double exponentRepulsivePotential, exponentAttractivePotential,interpolTMP,interpolDeltaX,interpolY1;
+  double interpolY2, cutoffFunctionIJ, attractiveExponential, repulsiveExponential, cutoffFunctionDerivedIJ,zeta;
+  double gtetaFunctionIJK,gtetaFunctionDerivedIJK,cutoffFunctionIK;
+  double cutoffFunctionDerivedIK,factor_force3_ij,factor_1_force3_ik;
+  double factor_2_force3_ik,betaZetaPowerIJK,betaZetaPowerDerivedIJK,factor_force_tot;
+  double factor_force_ij;
+  double gtetaFunctionDerived_temp,gtetaFunction_temp;
+
+  double evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const int tid = thr->get_tid();
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over full neighbor list of my atoms
+  for (ii = iifrom; ii < iito; ii++) {
+
+    i = ilist[ii];
+    itype = map[type[i]];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    fxtmp = fytmp = fztmp = 0.0;
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    if (check_error_thr((jnum > leadingDimensionInteractionList), tid,
+                        FLERR,"Too many neighbors for interaction list.\n"
+                        "Check your system or increase 'leadingDimension"
+                        "InteractionList'"))
+      return;
+
+    // Pre-calculate gteta and cutoff function
+    for (int neighbor_j = 0; neighbor_j < jnum; neighbor_j++) {
+
+      double dr_ij[3], r_ij;
+
+      j = jlist[neighbor_j];
+      j &= NEIGHMASK;
+
+      dr_ij[0] = xtmp - x[j][0];
+      dr_ij[1] = ytmp - x[j][1];
+      dr_ij[2] = ztmp - x[j][2];
+      r_ij = dr_ij[0]*dr_ij[0] + dr_ij[1]*dr_ij[1] + dr_ij[2]*dr_ij[2];
+
+      jtype = map[type[j]];
+      ijparam = elem2param[itype][jtype][jtype];
+
+      if (r_ij > params[ijparam].cutsq) continue;
+
+      r_ij = sqrt(r_ij);
+
+      invR_ij = 1.0 / r_ij;
+
+      directorCos_ij_x = invR_ij * dr_ij[0];
+      directorCos_ij_y = invR_ij * dr_ij[1];
+      directorCos_ij_z = invR_ij * dr_ij[2];
+
+      // preCutoffFunction
+      interpolDeltaX =  r_ij - GRIDSTART;
+      interpolTMP = (interpolDeltaX * GRIDDENSITY_FCUTOFF);
+      interpolIDX = (int) interpolTMP;
+      interpolY1 = cutoffFunction[itype][jtype][interpolIDX];
+      interpolY2 = cutoffFunction[itype][jtype][interpolIDX+1];
+      preCutoffFunction[neighbor_j] = interpolY1 + (interpolY2 - interpolY1) * (interpolTMP - interpolIDX);
+      // preCutoffFunctionDerived
+      interpolY1 = cutoffFunctionDerived[itype][jtype][interpolIDX];
+      interpolY2 = cutoffFunctionDerived[itype][jtype][interpolIDX+1];
+      preCutoffFunctionDerived[neighbor_j] = interpolY1 + (interpolY2 - interpolY1) * (interpolTMP - interpolIDX);
+
+
+      for (int neighbor_k = neighbor_j + 1; neighbor_k < jnum; neighbor_k++) {
+        double dr_ik[3], r_ik;
+
+        k = jlist[neighbor_k];
+        k &= NEIGHMASK;
+        ktype = map[type[k]];
+        ikparam = elem2param[itype][ktype][ktype];
+        ijkparam = elem2param[itype][jtype][ktype];
+
+        dr_ik[0] = xtmp -x[k][0];
+        dr_ik[1] = ytmp -x[k][1];
+        dr_ik[2] = ztmp -x[k][2];
+        r_ik = dr_ik[0]*dr_ik[0] + dr_ik[1]*dr_ik[1] + dr_ik[2]*dr_ik[2];
+
+        if (r_ik > params[ikparam].cutsq) continue;
+
+        r_ik = sqrt(r_ik);
+
+        invR_ik = 1.0 / r_ik;
+
+        directorCos_ik_x = invR_ik * dr_ik[0];
+        directorCos_ik_y = invR_ik * dr_ik[1];
+        directorCos_ik_z = invR_ik * dr_ik[2];
+
+        cosTeta = directorCos_ij_x * directorCos_ik_x + directorCos_ij_y * directorCos_ik_y + directorCos_ij_z * directorCos_ik_z;
+
+        // preGtetaFunction
+        interpolDeltaX=cosTeta+1.0;
+        interpolTMP = (interpolDeltaX * GRIDDENSITY_GTETA);
+        interpolIDX = (int) interpolTMP;
+        interpolY1 = gtetaFunction[itype][interpolIDX];
+        interpolY2 = gtetaFunction[itype][interpolIDX+1];
+        gtetaFunction_temp = interpolY1 + (interpolY2 - interpolY1) * (interpolTMP - interpolIDX);
+        // preGtetaFunctionDerived
+        interpolY1 = gtetaFunctionDerived[itype][interpolIDX];
+        interpolY2 = gtetaFunctionDerived[itype][interpolIDX+1];
+        gtetaFunctionDerived_temp = interpolY1 + (interpolY2 - interpolY1) * (interpolTMP - interpolIDX);
+
+        preGtetaFunction[neighbor_j][neighbor_k]=params[ijkparam].gamma*gtetaFunction_temp;
+        preGtetaFunctionDerived[neighbor_j][neighbor_k]=params[ijkparam].gamma*gtetaFunctionDerived_temp;
+        preGtetaFunction[neighbor_k][neighbor_j]=params[ijkparam].gamma*gtetaFunction_temp;
+        preGtetaFunctionDerived[neighbor_k][neighbor_j]=params[ijkparam].gamma*gtetaFunctionDerived_temp;
+
+      } // loop on K
+
+    } // loop on J
+
+
+    // loop over neighbours of atom i
+    for (int neighbor_j = 0; neighbor_j < jnum; neighbor_j++) {
+
+      double dr_ij[3], r_ij, f_ij[3];
+
+      j = jlist[neighbor_j];
+      j &= NEIGHMASK;
+
+      dr_ij[0] = xtmp - x[j][0];
+      dr_ij[1] = ytmp - x[j][1];
+      dr_ij[2] = ztmp - x[j][2];
+      r_ij = dr_ij[0]*dr_ij[0] + dr_ij[1]*dr_ij[1] + dr_ij[2]*dr_ij[2];
+
+      jtype = map[type[j]];
+      ijparam = elem2param[itype][jtype][jtype];
+
+      if (r_ij > params[ijparam].cutsq) continue;
+
+      r_ij = sqrt(r_ij);
+      invR_ij = 1.0 / r_ij;
+
+      directorCos_ij_x = invR_ij * dr_ij[0];
+      directorCos_ij_y = invR_ij * dr_ij[1];
+      directorCos_ij_z = invR_ij * dr_ij[2];
+
+      exponentRepulsivePotential = params[ijparam].lam1 * r_ij;
+      exponentAttractivePotential = params[ijparam].lam2 * r_ij;
+
+      // repulsiveExponential
+      interpolDeltaX =  exponentRepulsivePotential - minArgumentExponential;
+      interpolTMP = (interpolDeltaX * GRIDDENSITY_EXP);
+      interpolIDX = (int) interpolTMP;
+      interpolY1 = exponential[interpolIDX];
+      interpolY2 = exponential[interpolIDX+1];
+      repulsiveExponential = interpolY1 + (interpolY2 - interpolY1) * (interpolTMP - interpolIDX);
+      // attractiveExponential
+      interpolDeltaX =  exponentAttractivePotential - minArgumentExponential;
+      interpolTMP = (interpolDeltaX * GRIDDENSITY_EXP);
+      interpolIDX = (int) interpolTMP;
+      interpolY1 = exponential[interpolIDX];
+      interpolY2 = exponential[interpolIDX+1];
+      attractiveExponential = interpolY1 + (interpolY2 - interpolY1) * (interpolTMP - interpolIDX);
+
+      repulsivePotential = params[ijparam].biga * repulsiveExponential;
+      attractivePotential = -params[ijparam].bigb * attractiveExponential;
+
+      cutoffFunctionIJ = preCutoffFunction[neighbor_j];
+      cutoffFunctionDerivedIJ = preCutoffFunctionDerived[neighbor_j];
+
+      zeta = 0.0;
+
+      // first loop over neighbours of atom i except j - part 1/2
+      for (int neighbor_k = 0; neighbor_k < neighbor_j; neighbor_k++) {
+        double dr_ik[3], r_ik;
+
+        k = jlist[neighbor_k];
+        k &= NEIGHMASK;
+        ktype = map[type[k]];
+        ikparam = elem2param[itype][ktype][ktype];
+        ijkparam = elem2param[itype][jtype][ktype];
+
+        dr_ik[0] = xtmp -x[k][0];
+        dr_ik[1] = ytmp -x[k][1];
+        dr_ik[2] = ztmp -x[k][2];
+        r_ik = dr_ik[0]*dr_ik[0] + dr_ik[1]*dr_ik[1] + dr_ik[2]*dr_ik[2];
+
+        if (r_ik > params[ikparam].cutsq) continue;
+
+        r_ik = sqrt(r_ik);
+
+        invR_ik = 1.0 / r_ik;
+
+        directorCos_ik_x = invR_ik * r_ik_x;
+        directorCos_ik_y = invR_ik * r_ik_y;
+        directorCos_ik_z = invR_ik * r_ik_z;
+
+        gtetaFunctionIJK = preGtetaFunction[neighbor_j][neighbor_k];
+
+        cutoffFunctionIK = preCutoffFunction[neighbor_k];
+
+        zeta += cutoffFunctionIK * gtetaFunctionIJK;
+
+      }
+
+      // first loop over neighbours of atom i except j - part 2/2
+      for (int neighbor_k = neighbor_j+1; neighbor_k < jnum; neighbor_k++) {
+        double dr_ik[3], r_ik;
+
+        k = jlist[neighbor_k];
+        k &= NEIGHMASK;
+        ktype = map[type[k]];
+        ikparam = elem2param[itype][ktype][ktype];
+        ijkparam = elem2param[itype][jtype][ktype];
+
+        dr_ik[0] = xtmp -x[k][0];
+        dr_ik[1] = ytmp -x[k][1];
+        dr_ik[2] = ztmp -x[k][2];
+        r_ik = dr_ik[0]*dr_ik[0] + dr_ik[1]*dr_ik[1] + dr_ik[2]*dr_ik[2];
+
+        if (r_ik > params[ikparam].cutsq) continue;
+
+        r_ik = sqrt(r_ik);
+        invR_ik = 1.0 / r_ik;
+
+        directorCos_ik_x = invR_ik * dr_ik[0];
+        directorCos_ik_y = invR_ik * dr_ik[1];
+        directorCos_ik_z = invR_ik * dr_ik[2];
+
+        gtetaFunctionIJK = preGtetaFunction[neighbor_j][neighbor_k];
+
+        cutoffFunctionIK = preCutoffFunction[neighbor_k];
+
+        zeta += cutoffFunctionIK * gtetaFunctionIJK;
+      }
+
+      // betaZetaPowerIJK
+      interpolDeltaX= params[ijparam].beta * zeta;
+      interpolTMP = (interpolDeltaX * GRIDDENSITY_BIJ);
+      interpolIDX = (int) interpolTMP;
+      interpolY1 = betaZetaPower[itype][interpolIDX];
+      interpolY2 = betaZetaPower[itype][interpolIDX+1];
+      betaZetaPowerIJK = (interpolY1 + (interpolY2 - interpolY1) * (interpolTMP - interpolIDX));
+      // betaZetaPowerDerivedIJK
+      interpolY1 = betaZetaPowerDerived[itype][interpolIDX];
+      interpolY2 = betaZetaPowerDerived[itype][interpolIDX+1];
+      betaZetaPowerDerivedIJK = params[ijparam].beta*(interpolY1 + (interpolY2 - interpolY1) * (interpolTMP - interpolIDX));
+
+      // Forces and virial
+      factor_force_ij = 0.5*cutoffFunctionDerivedIJ*(repulsivePotential + attractivePotential * betaZetaPowerIJK)+0.5*cutoffFunctionIJ*(-repulsivePotential*params[ijparam].lam1-betaZetaPowerIJK*attractivePotential*params[ijparam].lam2);
+
+      f_ij[0] = factor_force_ij * directorCos_ij_x;
+      f_ij[1] = factor_force_ij * directorCos_ij_y;
+      f_ij[2] = factor_force_ij * directorCos_ij_z;
+
+      f[j][0] += f_ij[0];
+      f[j][1] += f_ij[1];
+      f[j][2] += f_ij[2];
+
+      fxtmp -= f_ij[0];
+      fytmp -= f_ij[1];
+      fztmp -= f_ij[2];
+
+      // potential energy
+      evdwl = cutoffFunctionIJ * repulsivePotential
+        + cutoffFunctionIJ * attractivePotential * betaZetaPowerIJK;
+
+      if (EVFLAG) ev_tally_thr(this,i, j, nlocal, /* newton_pair */ 1, 0.5 * evdwl, 0.0,
+                               -factor_force_ij*invR_ij, dr_ij[0], dr_ij[1], dr_ij[2],thr);
+
+      factor_force_tot= 0.5*cutoffFunctionIJ*attractivePotential*betaZetaPowerDerivedIJK;
+
+      // second loop over neighbours of atom i except j, forces and virial only - part 1/2
+      for (int neighbor_k = 0; neighbor_k < neighbor_j; neighbor_k++) {
+        double dr_ik[3], r_ik, f_ik[3];
+
+        k = jlist[neighbor_k];
+        k &= NEIGHMASK;
+        ktype = map[type[k]];
+        ikparam = elem2param[itype][ktype][ktype];
+        ijkparam = elem2param[itype][jtype][ktype];
+
+        dr_ik[0] = xtmp -x[k][0];
+        dr_ik[1] = ytmp -x[k][1];
+        dr_ik[2] = ztmp -x[k][2];
+        r_ik = dr_ik[0]*dr_ik[0] + dr_ik[1]*dr_ik[1] + dr_ik[2]*dr_ik[2];
+
+        if (r_ik > params[ikparam].cutsq) continue;
+
+        r_ik = sqrt(r_ik);
+        invR_ik = 1.0 / r_ik;
+
+        directorCos_ik_x = invR_ik * dr_ik[0];
+        directorCos_ik_y = invR_ik * dr_ik[1];
+        directorCos_ik_z = invR_ik * dr_ik[2];
+
+        cosTeta = directorCos_ij_x * directorCos_ik_x + directorCos_ij_y * directorCos_ik_y
+          + directorCos_ij_z * directorCos_ik_z;
+
+        gtetaFunctionIJK = preGtetaFunction[neighbor_j][neighbor_k];
+
+        gtetaFunctionDerivedIJK = preGtetaFunctionDerived[neighbor_j][neighbor_k];
+
+        cutoffFunctionIK = preCutoffFunction[neighbor_k];
+
+        cutoffFunctionDerivedIK = preCutoffFunctionDerived[neighbor_k];
+
+        factor_force3_ij= cutoffFunctionIK * gtetaFunctionDerivedIJK * invR_ij *factor_force_tot;
+
+        f_ij[0] = factor_force3_ij * (directorCos_ij_x*cosTeta - directorCos_ik_x);
+        f_ij[1] = factor_force3_ij * (directorCos_ij_y*cosTeta - directorCos_ik_y);
+        f_ij[2] = factor_force3_ij * (directorCos_ij_z*cosTeta - directorCos_ik_z);
+
+        factor_1_force3_ik = (cutoffFunctionIK * gtetaFunctionDerivedIJK * invR_ik)*factor_force_tot;
+        factor_2_force3_ik = -(cutoffFunctionDerivedIK * gtetaFunctionIJK)*factor_force_tot;
+
+        f_ik[0] = factor_1_force3_ik * (directorCos_ik_x*cosTeta - directorCos_ij_x)
+          + factor_2_force3_ik * directorCos_ik_x;
+        f_ik[1] = factor_1_force3_ik * (directorCos_ik_y*cosTeta - directorCos_ij_y)
+          + factor_2_force3_ik * directorCos_ik_y;
+        f_ik[2] = factor_1_force3_ik * (directorCos_ik_z*cosTeta - directorCos_ij_z)
+          + factor_2_force3_ik * directorCos_ik_z;
+
+        f[j][0] -= f_ij[0];
+        f[j][1] -= f_ij[1];
+        f[j][2] -= f_ij[2];
+
+        f[k][0] -= f_ik[0];
+        f[k][1] -= f_ik[1];
+        f[k][2] -= f_ik[2];
+
+        fxtmp += f_ij[0] + f_ik[0];
+        fytmp += f_ij[1] + f_ik[1];
+        fztmp += f_ij[2] + f_ik[2];
+
+        if (VFLAG_ATOM) v_tally3_thr(i,j,k,f_ij,f_ik,dr_ij,dr_ik,thr);
+      }
+
+      // second loop over neighbours of atom i except j, forces and virial only - part 2/2
+      for (int neighbor_k = neighbor_j+1; neighbor_k < jnum; neighbor_k++) {
+        double dr_ik[3], r_ik, f_ik[3];
+
+        k = jlist[neighbor_k];
+        k &= NEIGHMASK;
+        ktype = map[type[k]];
+        ikparam = elem2param[itype][ktype][ktype];
+        ijkparam = elem2param[itype][jtype][ktype];
+
+        dr_ik[0] = xtmp -x[k][0];
+        dr_ik[1] = ytmp -x[k][1];
+        dr_ik[2] = ztmp -x[k][2];
+        r_ik = dr_ik[0]*dr_ik[0] + dr_ik[1]*dr_ik[1] + dr_ik[2]*dr_ik[2];
+
+        if (r_ik > params[ikparam].cutsq) continue;
+
+        r_ik = sqrt(r_ik);
+        invR_ik = 1.0 / r_ik;
+
+        directorCos_ik_x = invR_ik * dr_ik[0];
+        directorCos_ik_y = invR_ik * dr_ik[1];
+        directorCos_ik_z = invR_ik * dr_ik[2];
+
+        cosTeta = directorCos_ij_x * directorCos_ik_x + directorCos_ij_y * directorCos_ik_y
+          + directorCos_ij_z * directorCos_ik_z;
+
+        gtetaFunctionIJK = preGtetaFunction[neighbor_j][neighbor_k];
+
+        gtetaFunctionDerivedIJK = preGtetaFunctionDerived[neighbor_j][neighbor_k];
+
+        cutoffFunctionIK = preCutoffFunction[neighbor_k];
+
+        cutoffFunctionDerivedIK = preCutoffFunctionDerived[neighbor_k];
+
+        factor_force3_ij= cutoffFunctionIK * gtetaFunctionDerivedIJK * invR_ij *factor_force_tot;
+
+        f_ij[0] = factor_force3_ij * (directorCos_ij_x*cosTeta - directorCos_ik_x);
+        f_ij[1] = factor_force3_ij * (directorCos_ij_y*cosTeta - directorCos_ik_y);
+        f_ij[2] = factor_force3_ij * (directorCos_ij_z*cosTeta - directorCos_ik_z);
+
+        factor_1_force3_ik = (cutoffFunctionIK * gtetaFunctionDerivedIJK * invR_ik)*factor_force_tot;
+        factor_2_force3_ik = -(cutoffFunctionDerivedIK * gtetaFunctionIJK)*factor_force_tot;
+
+        f_ik[0] = factor_1_force3_ik * (directorCos_ik_x*cosTeta - directorCos_ij_x)
+          + factor_2_force3_ik * directorCos_ik_x;
+        f_ik[1] = factor_1_force3_ik * (directorCos_ik_y*cosTeta - directorCos_ij_y)
+          + factor_2_force3_ik * directorCos_ik_y;
+        f_ik[2] = factor_1_force3_ik * (directorCos_ik_z*cosTeta - directorCos_ij_z)
+          + factor_2_force3_ik * directorCos_ik_z;
+
+        f[j][0] -= f_ij[0];
+        f[j][1] -= f_ij[1];
+        f[j][2] -= f_ij[2];
+
+        f[k][0] -= f_ik[0];
+        f[k][1] -= f_ik[1];
+        f[k][2] -= f_ik[2];
+
+        fxtmp += f_ij[0] + f_ik[0];
+        fytmp += f_ij[1] + f_ik[1];
+        fztmp += f_ij[2] + f_ik[2];
+
+        if (VFLAG_ATOM) v_tally3_thr(i,j,k,f_ij,f_ik,dr_ij,dr_ik,thr);
+
+      }
+    } // loop on J
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  } // loop on I
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairTersoffTableOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairTersoffTable::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_tersoff_table_omp.h b/src/USER-OMP/pair_tersoff_table_omp.h
new file mode 100644
index 0000000000..ffde58215d
--- /dev/null
+++ b/src/USER-OMP/pair_tersoff_table_omp.h
@@ -0,0 +1,43 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(tersoff/table/omp,PairTersoffTableOMP)
+
+#else
+
+#ifndef LMP_PAIR_TERSOFF_TABLE_OMP_H
+#define LMP_PAIR_TERSOFF_TABLE_OMP_H
+
+#include "pair_tersoff_table.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairTersoffTableOMP : public PairTersoffTable, public ThrOMP {
+
+ public:
+  PairTersoffTableOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int VFLAG_ATOM>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_tersoff_zbl_omp.cpp b/src/USER-OMP/pair_tersoff_zbl_omp.cpp
new file mode 100644
index 0000000000..f143312e0a
--- /dev/null
+++ b/src/USER-OMP/pair_tersoff_zbl_omp.cpp
@@ -0,0 +1,295 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Aidan Thompson (SNL) - original Tersoff implementation
+                        David Farrell (NWU) - ZBL addition
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "stdio.h"
+#include "stdlib.h"
+#include "string.h"
+#include "pair_tersoff_zbl_omp.h"
+#include "atom.h"
+#include "update.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "force.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+
+#include "math_const.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define MAXLINE 1024
+#define DELTA 4
+
+/* ----------------------------------------------------------------------
+   Fermi-like smoothing function
+------------------------------------------------------------------------- */
+
+static double F_fermi(const double r, const double expsc, const double cut)
+{
+  return 1.0 / (1.0 + exp(-expsc*(r-cut)));
+}
+
+/* ----------------------------------------------------------------------
+   Fermi-like smoothing function derivative with respect to r
+------------------------------------------------------------------------- */
+
+static double F_fermi_d(const double r, const double expsc, const double cut)
+{
+  return expsc*exp(-expsc*(r-cut)) / pow(1.0 + exp(-expsc*(r-cut)),2.0);
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairTersoffZBLOMP::PairTersoffZBLOMP(LAMMPS *lmp) : PairTersoffOMP(lmp)
+{
+  // hard-wired constants in metal or real units
+  // a0 = Bohr radius
+  // epsilon0 = permittivity of vacuum = q / energy-distance units
+  // e = unit charge
+  // 1 Kcal/mole = 0.043365121 eV
+
+  if (strcmp(update->unit_style,"metal") == 0) {
+    global_a_0 = 0.529;
+    global_epsilon_0 = 0.00552635;
+    global_e = 1.0;
+  } else if (strcmp(update->unit_style,"real") == 0) {
+    global_a_0 = 0.529;
+    global_epsilon_0 = 0.00552635 * 0.043365121;
+    global_e = 1.0;
+  } else error->all(FLERR,"Pair tersoff/zbl requires metal or real units");
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairTersoffZBLOMP::read_file(char *file)
+{
+  int params_per_line = 21;
+  char **words = new char*[params_per_line+1];
+
+  memory->sfree(params);
+  params = NULL;
+  nparams = 0;
+
+  // open file on proc 0
+
+  FILE *fp;
+  if (comm->me == 0) {
+    fp = fopen(file,"r");
+    if (fp == NULL) {
+      char str[128];
+      sprintf(str,"Cannot open Tersoff potential file %s",file);
+      error->one(FLERR,str);
+    }
+  }
+
+  // read each line out of file, skipping blank lines or leading '#'
+  // store line of params if all 3 element tags are in element list
+
+  int n,nwords,ielement,jelement,kelement;
+  char line[MAXLINE],*ptr;
+  int eof = 0;
+
+  while (1) {
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fp);
+      if (ptr == NULL) {
+        eof = 1;
+        fclose(fp);
+      } else n = strlen(line) + 1;
+    }
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof) break;
+    MPI_Bcast(&n,1,MPI_INT,0,world);
+    MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+    // strip comment, skip line if blank
+
+    if (ptr = strchr(line,'#')) *ptr = '\0';
+    nwords = atom->count_words(line);
+    if (nwords == 0) continue;
+
+    // concatenate additional lines until have params_per_line words
+
+    while (nwords < params_per_line) {
+      n = strlen(line);
+      if (comm->me == 0) {
+        ptr = fgets(&line[n],MAXLINE-n,fp);
+        if (ptr == NULL) {
+          eof = 1;
+          fclose(fp);
+        } else n = strlen(line) + 1;
+      }
+      MPI_Bcast(&eof,1,MPI_INT,0,world);
+      if (eof) break;
+      MPI_Bcast(&n,1,MPI_INT,0,world);
+      MPI_Bcast(line,n,MPI_CHAR,0,world);
+      if (ptr = strchr(line,'#')) *ptr = '\0';
+      nwords = atom->count_words(line);
+    }
+
+    if (nwords != params_per_line)
+      error->all(FLERR,"Incorrect format in Tersoff potential file");
+
+    // words = ptrs to all words in line
+
+    nwords = 0;
+    words[nwords++] = strtok(line," \t\n\r\f");
+    while (words[nwords++] = strtok(NULL," \t\n\r\f")) continue;
+
+    // ielement,jelement,kelement = 1st args
+    // if all 3 args are in element list, then parse this line
+    // else skip to next line
+
+    for (ielement = 0; ielement < nelements; ielement++)
+      if (strcmp(words[0],elements[ielement]) == 0) break;
+    if (ielement == nelements) continue;
+    for (jelement = 0; jelement < nelements; jelement++)
+      if (strcmp(words[1],elements[jelement]) == 0) break;
+    if (jelement == nelements) continue;
+    for (kelement = 0; kelement < nelements; kelement++)
+      if (strcmp(words[2],elements[kelement]) == 0) break;
+    if (kelement == nelements) continue;
+
+    // load up parameter settings and error check their values
+
+    if (nparams == maxparam) {
+      maxparam += DELTA;
+      params = (Param *) memory->srealloc(params,maxparam*sizeof(Param),
+                                          "pair:params");
+    }
+
+    params[nparams].ielement = ielement;
+    params[nparams].jelement = jelement;
+    params[nparams].kelement = kelement;
+    params[nparams].powerm = atof(words[3]);
+    params[nparams].gamma = atof(words[4]);
+    params[nparams].lam3 = atof(words[5]);
+    params[nparams].c = atof(words[6]);
+    params[nparams].d = atof(words[7]);
+    params[nparams].h = atof(words[8]);
+    params[nparams].powern = atof(words[9]);
+    params[nparams].beta = atof(words[10]);
+    params[nparams].lam2 = atof(words[11]);
+    params[nparams].bigb = atof(words[12]);
+    params[nparams].bigr = atof(words[13]);
+    params[nparams].bigd = atof(words[14]);
+    params[nparams].lam1 = atof(words[15]);
+    params[nparams].biga = atof(words[16]);
+    params[nparams].Z_i = atof(words[17]);
+    params[nparams].Z_j = atof(words[18]);
+    params[nparams].ZBLcut = atof(words[19]);
+    params[nparams].ZBLexpscale = atof(words[20]);
+
+    // currently only allow m exponent of 1 or 3
+
+    params[nparams].powermint = int(params[nparams].powerm);
+
+    if (
+        params[nparams].lam3 < 0.0 || params[nparams].c < 0.0 ||
+        params[nparams].d < 0.0 || params[nparams].powern < 0.0 ||
+        params[nparams].beta < 0.0 || params[nparams].lam2 < 0.0 ||
+        params[nparams].bigb < 0.0 || params[nparams].bigr < 0.0 ||
+        params[nparams].bigd < 0.0 ||
+        params[nparams].bigd > params[nparams].bigr ||
+        params[nparams].lam3 < 0.0 || params[nparams].biga < 0.0 ||
+        params[nparams].powerm - params[nparams].powermint != 0.0 ||
+        (params[nparams].powermint != 3 && params[nparams].powermint != 1) ||
+        params[nparams].gamma < 0.0 ||
+        params[nparams].Z_i < 1.0 || params[nparams].Z_j < 1.0 ||
+        params[nparams].ZBLcut < 0.0 || params[nparams].ZBLexpscale < 0.0)
+      error->all(FLERR,"Illegal Tersoff parameter");
+
+    nparams++;
+  }
+
+  delete [] words;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairTersoffZBLOMP::force_zeta(Param *param, double rsq, double zeta_ij,
+                                   double &fforce, double &prefactor,
+                                   int eflag, double &eng)
+{
+  double r,fa,fa_d,bij;
+
+  r = sqrt(rsq);
+
+  fa = (r > param->bigr + param->bigd) ? 0.0 :
+    -param->bigb * exp(-param->lam2 * r) * ters_fc(r,param) *
+    F_fermi(r,param->ZBLexpscale,param->ZBLcut);
+
+  fa_d = (r > param->bigr + param->bigd) ? 0.0 :
+    param->bigb * exp(-param->lam2 * r) *
+    (param->lam2 * ters_fc(r,param) *
+     F_fermi(r,param->ZBLexpscale,param->ZBLcut) -
+     ters_fc_d(r,param) * F_fermi(r,param->ZBLexpscale,param->ZBLcut)
+     - ters_fc(r,param) * F_fermi_d(r,param->ZBLexpscale,param->ZBLcut));
+
+  bij = ters_bij(zeta_ij,param);
+  fforce = 0.5*bij*fa_d / r;
+  prefactor = -0.5*fa * ters_bij_d(zeta_ij,param);
+  if (eflag) eng = 0.5*bij*fa;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairTersoffZBLOMP::repulsive(Param *param, double rsq, double &fforce,
+                               int eflag, double &eng)
+{
+  double r,tmp_fc,tmp_fc_d,tmp_exp;
+
+  // Tersoff repulsive portion
+
+  r = sqrt(rsq);
+  tmp_fc = ters_fc(r,param);
+  tmp_fc_d = ters_fc_d(r,param);
+  tmp_exp = exp(-param->lam1 * r);
+  double fforce_ters = param->biga * tmp_exp * (tmp_fc_d - tmp_fc*param->lam1);
+  double eng_ters = tmp_fc * param->biga * tmp_exp;
+
+  // ZBL repulsive portion
+
+  double esq = pow(global_e,2.0);
+  double a_ij = (0.8854*global_a_0) /
+    (pow(param->Z_i,0.23) + pow(param->Z_j,0.23));
+  double premult = (param->Z_i * param->Z_j * esq)/(4.0*MY_PI*global_epsilon_0);
+  double r_ov_a = r/a_ij;
+  double phi = 0.1818*exp(-3.2*r_ov_a) + 0.5099*exp(-0.9423*r_ov_a) +
+    0.2802*exp(-0.4029*r_ov_a) + 0.02817*exp(-0.2016*r_ov_a);
+  double dphi = (1.0/a_ij) * (-3.2*0.1818*exp(-3.2*r_ov_a) -
+                              0.9423*0.5099*exp(-0.9423*r_ov_a) -
+                              0.4029*0.2802*exp(-0.4029*r_ov_a) -
+                              0.2016*0.02817*exp(-0.2016*r_ov_a));
+  double fforce_ZBL = premult*-pow(r,-2.0)* phi + premult*pow(r,-1.0)*dphi;
+  double eng_ZBL = premult*(1.0/r)*phi;
+
+  // combine two parts with smoothing by Fermi-like function
+
+  fforce = -(-F_fermi_d(r,param->ZBLexpscale,param->ZBLcut) * eng_ZBL +
+             (1.0 - F_fermi(r,param->ZBLexpscale,param->ZBLcut))*fforce_ZBL +
+             F_fermi_d(r,param->ZBLexpscale,param->ZBLcut)*eng_ters +
+             F_fermi(r,param->ZBLexpscale,param->ZBLcut)*fforce_ters) / r;
+
+  if (eflag)
+    eng = (1.0 - F_fermi(r,param->ZBLexpscale,param->ZBLcut))*eng_ZBL +
+      F_fermi(r,param->ZBLexpscale,param->ZBLcut)*eng_ters;
+}
diff --git a/src/USER-OMP/pair_tersoff_zbl_omp.h b/src/USER-OMP/pair_tersoff_zbl_omp.h
new file mode 100644
index 0000000000..32d7b6b4c4
--- /dev/null
+++ b/src/USER-OMP/pair_tersoff_zbl_omp.h
@@ -0,0 +1,45 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(tersoff/zbl/omp,PairTersoffZBLOMP)
+
+#else
+
+#ifndef LMP_PAIR_TERSOFF_ZBL_OMP_H
+#define LMP_PAIR_TERSOFF_ZBL_OMP_H
+
+#include "pair_tersoff_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairTersoffZBLOMP : public PairTersoffOMP {
+ public:
+  PairTersoffZBLOMP(class LAMMPS *);
+  virtual ~PairTersoffZBLOMP() {}
+
+ protected:
+  double global_a_0;                // Bohr radius for Coulomb repulsion
+  double global_epsilon_0;        // permittivity of vacuum for Coulomb repulsion
+  double global_e;                // proton charge (negative of electron charge)
+
+  virtual void read_file(char *);
+  virtual void repulsive(Param *, double, double &, int, double &);
+  virtual void force_zeta(Param *, double, double, double &, double &, int, double &);
+
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_tri_lj_omp.cpp b/src/USER-OMP/pair_tri_lj_omp.cpp
new file mode 100644
index 0000000000..16dce231ba
--- /dev/null
+++ b/src/USER-OMP/pair_tri_lj_omp.cpp
@@ -0,0 +1,411 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_tri_lj_omp.h"
+#include "math_extra.h"
+#include "atom.h"
+#include "atom_vec_tri.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include <string.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairTriLJOMP::PairTriLJOMP(LAMMPS *lmp) :
+  PairTriLJ(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairTriLJOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+  const int * const tri = atom->tri;
+  const int * const type = atom->type;
+  AtomVecTri::Bonus * const bonus = avec->bonus;
+
+  // grow discrete list if necessary and initialize
+
+  if (nall > nmax) {
+    nmax = nall;
+    memory->destroy(dnum);
+    memory->destroy(dfirst);
+    memory->create(dnum,nall,"pair:dnum");
+    memory->create(dfirst,nall,"pair:dfirst");
+  }
+  memset(dnum,0,nall*sizeof(int));
+  ndiscrete = 0;
+
+  // need to discretize the system ahead of time
+  // until we find a good way to multi-thread it.
+  for (int i = 0; i < nall; ++i) {
+    double dc1[3],dc2[3],dc3[3],p[3][3];
+
+    if (tri[i] >= 0) {
+      if (dnum[i] == 0) {
+        MathExtra::quat_to_mat(bonus[tri[i]].quat,p);
+        MathExtra::matvec(p,bonus[tri[i]].c1,dc1);
+        MathExtra::matvec(p,bonus[tri[i]].c2,dc2);
+        MathExtra::matvec(p,bonus[tri[i]].c3,dc3);
+        dfirst[i] = ndiscrete;
+        discretize(i,sigma[type[i]][type[i]],dc1,dc2,dc3);
+        dnum[i] = ndiscrete - dfirst[i];
+      }
+    }
+  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairTriLJOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  int ni,nj,npi,npj,ifirst,jfirst;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r6inv,term1,term2,sig,sig3,forcelj;
+  double dxi,dxj,dyi,dyj,dzi,dzj;
+  double xi[3],xj[3],fi[3],fj[3],ti[3],tj[3];
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  double * const * const torque = thr->get_torque();
+  const int * const tri = atom->tri;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq >= cutsq[itype][jtype]) continue;
+
+      // tri/tri interactions = NxN particles
+      // c1,c2,c3 = corner pts of triangle I or J
+
+      evdwl = 0.0;
+      if (tri[i] >= 0 && tri[j] >= 0) {
+        npi = dnum[i];
+        ifirst = dfirst[i];
+        npj = dnum[j];
+        jfirst = dfirst[j];
+
+        fi[0]=fi[1]=fi[2]=fj[0]=fj[1]=fj[2]=0.0;
+        ti[0]=ti[1]=ti[2]=tj[0]=tj[1]=tj[2]=0.0;
+
+        for (ni = 0; ni < npi; ni++) {
+          dxi = discrete[ifirst+ni].dx;
+          dyi = discrete[ifirst+ni].dy;
+          dzi = discrete[ifirst+ni].dz;
+
+          for (nj = 0; nj < npj; nj++) {
+            dxj = discrete[jfirst+nj].dx;
+            dyj = discrete[jfirst+nj].dy;
+            dzj = discrete[jfirst+nj].dz;
+
+            xi[0] = x[i][0] + dxi;
+            xi[1] = x[i][1] + dyi;
+            xi[2] = x[i][2] + dzi;
+            xj[0] = x[j][0] + dxj;
+            xj[1] = x[j][1] + dyj;
+            xj[2] = x[j][2] + dzj;
+
+            delx = xi[0] - xj[0];
+            dely = xi[1] - xj[1];
+            delz = xi[2] - xj[2];
+            rsq = delx*delx + dely*dely + delz*delz;
+
+            sig = 0.5 * (discrete[ifirst+ni].sigma+discrete[jfirst+nj].sigma);
+            sig3 = sig*sig*sig;
+            term2 = 24.0*epsilon[itype][jtype] * sig3*sig3;
+            term1 = 2.0 * term2 * sig3*sig3;
+            r2inv = 1.0/rsq;
+            r6inv = r2inv*r2inv*r2inv;
+            forcelj = r6inv * (term1*r6inv - term2);
+            fpair = forcelj*r2inv;
+
+            if (EFLAG) evdwl += r6inv*(term1/12.0*r6inv-term2/6.0);
+
+            fi[0] += delx*fpair;
+            fi[1] += dely*fpair;
+            fi[2] += delz*fpair;
+            ti[0] += fpair*(dyi*delz - dzi*dely);
+            ti[1] += fpair*(dzi*delx - dxi*delz);
+            ti[2] += fpair*(dxi*dely - dyi*delx);
+
+            if (NEWTON_PAIR || j < nlocal) {
+              fj[0] -= delx*fpair;
+              fj[1] -= dely*fpair;
+              fj[2] -= delz*fpair;
+              tj[0] -= fpair*(dyj*delz - dzj*dely);
+              tj[1] -= fpair*(dzj*delx - dxj*delz);
+              tj[2] -= fpair*(dxj*dely - dyj*delx);
+            }
+          }
+        }
+
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[i][2] += fi[2];
+        f[j][0] += fj[0];
+        f[j][1] += fj[1];
+        f[j][2] += fj[2];
+        torque[i][0] += ti[0];
+        torque[i][1] += ti[1];
+        torque[i][2] += ti[2];
+        torque[j][0] += tj[0];
+        torque[j][1] += tj[1];
+        torque[j][2] += tj[2];
+
+        // tri/particle interaction = Nx1 particles
+        // c1,c2,c3 = corner pts of triangle I
+
+      } else if (tri[i] >= 0) {
+        npi = dnum[i];
+        ifirst = dfirst[i];
+
+        fi[0]=fi[1]=fi[2]=fj[0]=fj[1]=fj[2]=0.0;
+        ti[0]=ti[1]=ti[2]=tj[0]=tj[1]=tj[2]=0.0;
+
+        for (ni = 0; ni < npi; ni++) {
+          dxi = discrete[ifirst+ni].dx;
+          dyi = discrete[ifirst+ni].dy;
+          dzi = discrete[ifirst+ni].dz;
+
+          xi[0] = x[i][0] + dxi;
+          xi[1] = x[i][1] + dyi;
+          xi[2] = x[i][2] + dzi;
+          xj[0] = x[j][0];
+          xj[1] = x[j][1];
+          xj[2] = x[j][2];
+
+          delx = xi[0] - xj[0];
+          dely = xi[1] - xj[1];
+          delz = xi[2] - xj[2];
+          rsq = delx*delx + dely*dely + delz*delz;
+
+          sig = 0.5 * (discrete[ifirst+ni].sigma+sigma[jtype][jtype]);
+          sig3 = sig*sig*sig;
+          term2 = 24.0*epsilon[itype][jtype] * sig3*sig3;
+          term1 = 2.0 * term2 * sig3*sig3;
+          r2inv = 1.0/rsq;
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (term1*r6inv - term2);
+          fpair = forcelj*r2inv;
+
+          if (EFLAG) evdwl += r6inv*(term1/12.0*r6inv-term2/6.0);
+
+          fi[0] += delx*fpair;
+          fi[1] += dely*fpair;
+          fi[2] += delz*fpair;
+          ti[0] += fpair*(dyi*delz - dzi*dely);
+          ti[1] += fpair*(dzi*delx - dxi*delz);
+          ti[2] += fpair*(dxi*dely - dyi*delx);
+
+          if (NEWTON_PAIR || j < nlocal) {
+            fj[0] -= delx*fpair;
+            fj[1] -= dely*fpair;
+            fj[2] -= delz*fpair;
+            tj[0] -= fpair*(dyj*delz - dzj*dely);
+            tj[1] -= fpair*(dzj*delx - dxj*delz);
+            tj[2] -= fpair*(dxj*dely - dyj*delx);
+          }
+        }
+
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[i][2] += fi[2];
+        f[j][0] += fj[0];
+        f[j][1] += fj[1];
+        f[j][2] += fj[2];
+        torque[i][0] += ti[0];
+        torque[i][1] += ti[1];
+        torque[i][2] += ti[2];
+        torque[j][0] += tj[0];
+        torque[j][1] += tj[1];
+        torque[j][2] += tj[2];
+
+        // particle/tri interaction = Nx1 particles
+        // c1,c2,c3 = corner pts of triangle J
+
+      } else if (tri[j] >= 0) {
+        npj = dnum[j];
+        jfirst = dfirst[j];
+
+        fi[0]=fi[1]=fi[2]=fj[0]=fj[1]=fj[2]=0.0;
+        ti[0]=ti[1]=ti[2]=tj[0]=tj[1]=tj[2]=0.0;
+
+        for (nj = 0; nj < npj; nj++) {
+          dxj = discrete[jfirst+nj].dx;
+          dyj = discrete[jfirst+nj].dy;
+          dzj = discrete[jfirst+nj].dz;
+
+          xi[0] = x[i][0];
+          xi[1] = x[i][1];
+          xi[2] = x[i][2];
+          xj[0] = x[j][0] + dxj;
+          xj[1] = x[j][1] + dyj;
+          xj[2] = x[j][2] + dzj;
+
+          delx = xi[0] - xj[0];
+          dely = xi[1] - xj[1];
+          delz = xi[2] - xj[2];
+          rsq = delx*delx + dely*dely + delz*delz;
+
+          sig = 0.5 * (sigma[itype][itype]+discrete[jfirst+nj].sigma);
+          sig3 = sig*sig*sig;
+          term2 = 24.0*epsilon[itype][jtype] * sig3*sig3;
+          term1 = 2.0 * term2 * sig3*sig3;
+          r2inv = 1.0/rsq;
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (term1*r6inv - term2);
+          fpair = forcelj*r2inv;
+
+          if (EFLAG) evdwl += r6inv*(term1/12.0*r6inv-term2/6.0);
+
+          if (EFLAG) evdwl += r6inv*(term1/12.0*r6inv-term2/6.0);
+
+          fi[0] += delx*fpair;
+          fi[1] += dely*fpair;
+          fi[2] += delz*fpair;
+          ti[0] += fpair*(dyi*delz - dzi*dely);
+          ti[1] += fpair*(dzi*delx - dxi*delz);
+          ti[2] += fpair*(dxi*dely - dyi*delx);
+
+          if (NEWTON_PAIR || j < nlocal) {
+            fj[0] -= delx*fpair;
+            fj[1] -= dely*fpair;
+            fj[2] -= delz*fpair;
+            tj[0] -= fpair*(dyj*delz - dzj*dely);
+            tj[1] -= fpair*(dzj*delx - dxj*delz);
+            tj[2] -= fpair*(dxj*dely - dyj*delx);
+          }
+        }
+
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[i][2] += fi[2];
+        f[j][0] += fj[0];
+        f[j][1] += fj[1];
+        f[j][2] += fj[2];
+        torque[i][0] += ti[0];
+        torque[i][1] += ti[1];
+        torque[i][2] += ti[2];
+        torque[j][0] += tj[0];
+        torque[j][1] += tj[1];
+        torque[j][2] += tj[2];
+
+        // particle/particle interaction = 1x1 particles
+
+      } else {
+        r2inv = 1.0/rsq;
+        r6inv = r2inv*r2inv*r2inv;
+        forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+        fpair = forcelj*r2inv;
+
+        if (EFLAG)
+          evdwl += r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]);
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+      }
+
+      if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                               evdwl,0.0,fpair,delx,dely,delz,thr);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairTriLJOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairTriLJ::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_tri_lj_omp.h b/src/USER-OMP/pair_tri_lj_omp.h
new file mode 100644
index 0000000000..7c66fdcf44
--- /dev/null
+++ b/src/USER-OMP/pair_tri_lj_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(tri/lj/omp,PairTriLJOMP)
+
+#else
+
+#ifndef LMP_PAIR_TRI_LJ_OMP_H
+#define LMP_PAIR_TRI_LJ_OMP_H
+
+#include "pair_tri_lj.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairTriLJOMP : public PairTriLJ, public ThrOMP {
+
+ public:
+  PairTriLJOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_yukawa_colloid_omp.cpp b/src/USER-OMP/pair_yukawa_colloid_omp.cpp
new file mode 100644
index 0000000000..e66d1bb2bf
--- /dev/null
+++ b/src/USER-OMP/pair_yukawa_colloid_omp.cpp
@@ -0,0 +1,161 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_yukawa_colloid_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairYukawaColloidOMP::PairYukawaColloidOMP(LAMMPS *lmp) :
+  PairYukawaColloid(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairYukawaColloidOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairYukawaColloidOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair,radi,radj;
+  double rsq,r,rinv,screening,forceyukawa,factor;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const double * const radius = atom->radius;
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    radi = radius[i];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      radj = radius[j];
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r = sqrt(rsq);
+        rinv = 1.0/r;
+        screening = exp(-kappa*(r-(radi+radj)));
+        forceyukawa = a[itype][jtype] * screening;
+
+        fpair = factor*forceyukawa * rinv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = a[itype][jtype]/kappa * screening - offset[itype][jtype];
+          evdwl *= factor;
+        }
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairYukawaColloidOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairYukawaColloid::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_yukawa_colloid_omp.h b/src/USER-OMP/pair_yukawa_colloid_omp.h
new file mode 100644
index 0000000000..b0ef483b5b
--- /dev/null
+++ b/src/USER-OMP/pair_yukawa_colloid_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(yukawa/colloid/omp,PairYukawaColloidOMP)
+
+#else
+
+#ifndef LMP_PAIR_YUKAWA_COLLOID_OMP_H
+#define LMP_PAIR_YUKAWA_COLLOID_OMP_H
+
+#include "pair_yukawa_colloid.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairYukawaColloidOMP : public PairYukawaColloid, public ThrOMP {
+
+ public:
+  PairYukawaColloidOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pair_yukawa_omp.cpp b/src/USER-OMP/pair_yukawa_omp.cpp
new file mode 100644
index 0000000000..3028901a9e
--- /dev/null
+++ b/src/USER-OMP/pair_yukawa_omp.cpp
@@ -0,0 +1,160 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_yukawa_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairYukawaOMP::PairYukawaOMP(LAMMPS *lmp) :
+  PairYukawa(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  suffix_flag |= Suffix::OMP;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairYukawaOMP::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int inum = list->inum;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int ifrom, ito, tid;
+
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    if (evflag) {
+      if (eflag) {
+        if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+        else eval<1,1,0>(ifrom, ito, thr);
+      } else {
+        if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+        else eval<1,0,0>(ifrom, ito, thr);
+      }
+    } else {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairYukawaOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,r2inv,r,rinv,screening,forceyukawa,factor;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const double * const * const x = atom->x;
+  double * const * const f = thr->get_f();
+  const int * const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * const special_lj = force->special_lj;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        r = sqrt(rsq);
+        rinv = 1.0/r;
+        screening = exp(-kappa*r);
+        forceyukawa = a[itype][jtype] * screening * (kappa + rinv);
+
+        fpair = factor*forceyukawa * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = a[itype][jtype] * screening * rinv - offset[itype][jtype];
+          evdwl *= factor;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i][0] += fxtmp;
+    f[i][1] += fytmp;
+    f[i][2] += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairYukawaOMP::memory_usage()
+{
+  double bytes = memory_usage_thr();
+  bytes += PairYukawa::memory_usage();
+
+  return bytes;
+}
diff --git a/src/USER-OMP/pair_yukawa_omp.h b/src/USER-OMP/pair_yukawa_omp.h
new file mode 100644
index 0000000000..d9db213474
--- /dev/null
+++ b/src/USER-OMP/pair_yukawa_omp.h
@@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(yukawa/omp,PairYukawaOMP)
+
+#else
+
+#ifndef LMP_PAIR_YUKAWA_OMP_H
+#define LMP_PAIR_YUKAWA_OMP_H
+
+#include "pair_yukawa.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairYukawaOMP : public PairYukawa, public ThrOMP {
+
+ public:
+  PairYukawaOMP(class LAMMPS *);
+
+  virtual void compute(int, int);
+  virtual double memory_usage();
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int ifrom, int ito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pppm_cg_omp.cpp b/src/USER-OMP/pppm_cg_omp.cpp
new file mode 100644
index 0000000000..51fc048b3b
--- /dev/null
+++ b/src/USER-OMP/pppm_cg_omp.cpp
@@ -0,0 +1,610 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "pppm_cg_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "force.h"
+#include "memory.h"
+#include "math_const.h"
+
+#include <string.h>
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALLQ 0.00001
+#if defined(FFT_SINGLE)
+#define ZEROF 0.0f
+#else
+#define ZEROF 0.0
+#endif
+
+#define EPS_HOC 1.0e-7
+
+/* ---------------------------------------------------------------------- */
+
+PPPMCGOMP::PPPMCGOMP(LAMMPS *lmp, int narg, char **arg) :
+  PPPMCG(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ----------------------------------------------------------------------
+   allocate memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+void PPPMCGOMP::allocate()
+{
+  PPPMCG::allocate();
+
+  const int nthreads = comm->nthreads;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+
+    FFT_SCALAR **rho1d_thr;
+    memory->create2d_offset(rho1d_thr,3,-order/2,order/2,"pppm:rho1d_thr");
+    ThrData *thr = fix->get_thr(tid);
+    thr->init_pppm(static_cast<void *>(rho1d_thr));
+  }
+
+  const int nzend = (nzhi_out-nzlo_out+1)*nthreads + nzlo_out -1;
+
+  // reallocate density brick, so it fits our needs
+  memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
+  memory->create3d_offset(density_brick,nzlo_out,nzend,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm:density_brick");
+}
+
+// NOTE: special version of reduce_data for FFT_SCALAR data type.
+// reduce per thread data into the first part of the data
+// array that is used for the non-threaded parts and reset
+// the temporary storage to 0.0. this routine depends on
+// multi-dimensional arrays like force stored in this order
+// x1,y1,z1,x2,y2,z2,...
+// we need to post a barrier to wait until all threads are done
+// writing to the array.
+static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid)
+{
+#if defined(_OPENMP)
+  // NOOP in non-threaded execution.
+  if (nthreads == 1) return;
+#pragma omp barrier
+  {
+    const int nvals = ndim*nall;
+    const int idelta = nvals/nthreads + 1;
+    const int ifrom = tid*idelta;
+    const int ito   = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta);
+
+    // this if protects against having more threads than atoms
+    if (ifrom < nall) {
+      for (int m = ifrom; m < ito; ++m) {
+        for (int n = 1; n < nthreads; ++n) {
+          dall[m] += dall[n*nvals + m];
+          dall[n*nvals + m] = 0.0;
+        }
+      }
+    }
+  }
+#else
+  // NOOP in non-threaded execution.
+  return;
+#endif
+}
+
+/* ----------------------------------------------------------------------
+   free memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+void PPPMCGOMP::deallocate()
+{
+  PPPMCG::deallocate();
+  for (int i=0; i < comm->nthreads; ++i) {
+    ThrData * thr = fix->get_thr(i);
+    FFT_SCALAR ** rho1d_thr = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+    memory->destroy2d_offset(rho1d_thr,-order/2);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   adjust PPPMCG coeffs, called initially and whenever volume has changed
+------------------------------------------------------------------------- */
+
+void PPPMCGOMP::setup()
+{
+  int i,j,k,n;
+  double *prd;
+
+  // volume-dependent factors
+  // adjust z dimension for 2d slab PPPM
+  // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0
+
+  if (triclinic == 0) prd = domain->prd;
+  else prd = domain->prd_lamda;
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  volume = xprd * yprd * zprd_slab;
+
+  delxinv = nx_pppm/xprd;
+  delyinv = ny_pppm/yprd;
+  delzinv = nz_pppm/zprd_slab;
+
+  delvolinv = delxinv*delyinv*delzinv;
+
+  const double unitkx = (2.0*MY_PI/xprd);
+  const double unitky = (2.0*MY_PI/yprd);
+  const double unitkz = (2.0*MY_PI/zprd_slab);
+
+  // fkx,fky,fkz for my FFT grid pts
+
+  double per;
+
+  for (i = nxlo_fft; i <= nxhi_fft; i++) {
+    per = i - nx_pppm*(2*i/nx_pppm);
+    fkx[i] = unitkx*per;
+  }
+
+  for (i = nylo_fft; i <= nyhi_fft; i++) {
+    per = i - ny_pppm*(2*i/ny_pppm);
+    fky[i] = unitky*per;
+  }
+
+  for (i = nzlo_fft; i <= nzhi_fft; i++) {
+    per = i - nz_pppm*(2*i/nz_pppm);
+    fkz[i] = unitkz*per;
+  }
+
+  // virial coefficients
+
+  double sqk,vterm;
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++) {
+    for (j = nylo_fft; j <= nyhi_fft; j++) {
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k];
+        if (sqk == 0.0) {
+          vg[n][0] = 0.0;
+          vg[n][1] = 0.0;
+          vg[n][2] = 0.0;
+          vg[n][3] = 0.0;
+          vg[n][4] = 0.0;
+          vg[n][5] = 0.0;
+        } else {
+          vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
+          vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i];
+          vg[n][1] = 1.0 + vterm*fky[j]*fky[j];
+          vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k];
+          vg[n][3] = vterm*fkx[i]*fky[j];
+          vg[n][4] = vterm*fkx[i]*fkz[k];
+          vg[n][5] = vterm*fky[j]*fkz[k];
+        }
+        n++;
+      }
+    }
+  }
+
+  // modified (Hockney-Eastwood) Coulomb Green's function
+
+  const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
+                                    pow(-log(EPS_HOC),0.25));
+  const int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
+                                    pow(-log(EPS_HOC),0.25));
+  const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
+                                    pow(-log(EPS_HOC),0.25));
+
+  const double form = 1.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    int tid,nn,nnfrom,nnto,nx,ny,nz,k,l,m;
+    double snx,sny,snz,sqk;
+    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+    double sum1,dot1,dot2;
+    double numerator,denominator;
+    const double gew2 = -4.0*g_ewald*g_ewald;
+
+    const int nnx = nxhi_fft-nxlo_fft+1;
+    const int nny = nyhi_fft-nylo_fft+1;
+
+    loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads);
+
+    for (m = nzlo_fft; m <= nzhi_fft; m++) {
+
+      const double fkzm = fkz[m];
+      snz = sin(0.5*fkzm*zprd_slab/nz_pppm);
+      snz *= snz;
+
+      for (l = nylo_fft; l <= nyhi_fft; l++) {
+        const double fkyl = fky[l];
+        sny = sin(0.5*fkyl*yprd/ny_pppm);
+        sny *= sny;
+
+        for (k = nxlo_fft; k <= nxhi_fft; k++) {
+
+          /* only compute the part designated to this thread */
+          nn = k-nxlo_fft + nnx*(l-nylo_fft + nny*(m-nzlo_fft));
+          if ((nn < nnfrom) || (nn >=nnto)) continue;
+
+          const double fkxk = fkx[k];
+          snx = sin(0.5*fkxk*xprd/nx_pppm);
+          snx *= snx;
+
+          sqk = fkxk*fkxk + fkyl*fkyl + fkzm*fkzm;
+
+          if (sqk != 0.0) {
+            numerator = form*MY_4PI/sqk;
+            denominator = gf_denom(snx,sny,snz);
+            const double dorder = static_cast<double>(order);
+            sum1 = 0.0;
+            for (nx = -nbx; nx <= nbx; nx++) {
+              qx = fkxk + unitkx*nx_pppm*nx;
+              sx = exp(qx*qx/gew2);
+              wx = 1.0;
+              argx = 0.5*qx*xprd/nx_pppm;
+              if (argx != 0.0) wx = pow(sin(argx)/argx,dorder);
+              wx *=wx;
+
+              for (ny = -nby; ny <= nby; ny++) {
+                qy = fkyl + unitky*ny_pppm*ny;
+                sy = exp(qy*qy/gew2);
+                wy = 1.0;
+                argy = 0.5*qy*yprd/ny_pppm;
+                if (argy != 0.0) wy = pow(sin(argy)/argy,dorder);
+                wy *= wy;
+
+                for (nz = -nbz; nz <= nbz; nz++) {
+                  qz = fkzm + unitkz*nz_pppm*nz;
+                  sz = exp(qz*qz/gew2);
+                  wz = 1.0;
+                  argz = 0.5*qz*zprd_slab/nz_pppm;
+                  if (argz != 0.0) wz = pow(sin(argz)/argz,dorder);
+                  wz *= wz;
+
+                  dot1 = fkxk*qx + fkyl*qy + fkzm*qz;
+                  dot2 = qx*qx+qy*qy+qz*qz;
+                  sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
+                }
+              }
+            }
+            greensfn[nn] = numerator*sum1/denominator;
+          } else greensfn[nn] = 0.0;
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   run the regular toplevel compute method from plain PPPPMCG
+   which will have individual methods replaced by our threaded
+   versions and then call the obligatory force reduction.
+------------------------------------------------------------------------- */
+
+void PPPMCGOMP::compute(int eflag, int vflag)
+{
+
+  PPPMCG::compute(eflag,vflag);
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = charge "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid
+------------------------------------------------------------------------- */
+
+void PPPMCGOMP::make_rho()
+{
+  const double * const q = atom->q;
+  const double * const * const x = atom->x;
+  const int nthreads = comm->nthreads;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    // each thread works on a fixed chunk of atoms.
+    const int tid = omp_get_thread_num();
+    const int inum = num_charged;
+    const int idelta = 1 + inum/nthreads;
+    const int ifrom = tid*idelta;
+    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
+#else
+    const int tid = 0;
+    const int ifrom = 0;
+    const int ito = num_charged;
+#endif
+
+    int i,j,l,m,n,nx,ny,nz,mx,my,mz;
+    FFT_SCALAR dx,dy,dz,x0,y0,z0;
+
+    // set up clear 3d density array
+    const int nzoffs = (nzhi_out-nzlo_out+1)*tid;
+    FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]);
+    memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR));
+
+    ThrData *thr = fix->get_thr(tid);
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+
+    // loop over my charges, add their contribution to nearby grid points
+    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+    // (dx,dy,dz) = distance to "lower left" grid pt
+    // (mx,my,mz) = global coords of moving stencil pt
+
+    // this if protects against having more threads than charged local atoms
+    if (ifrom < num_charged) {
+      for (j = ifrom; j < ito; j++) {
+        i = is_charged[j];
+
+        nx = part2grid[i][0];
+        ny = part2grid[i][1];
+        nz = part2grid[i][2];
+        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+        compute_rho1d_thr(r1d,dx,dy,dz);
+
+        z0 = delvolinv * q[i];
+        for (n = nlower; n <= nupper; n++) {
+          mz = n+nz;
+          y0 = z0*r1d[2][n];
+          for (m = nlower; m <= nupper; m++) {
+            my = m+ny;
+            x0 = y0*r1d[1][m];
+            for (l = nlower; l <= nupper; l++) {
+              mx = l+nx;
+              db[mz][my][mx] += x0*r1d[0][l];
+            }
+          }
+        }
+      }
+    }
+#if defined(_OPENMP)
+    // reduce 3d density array
+    if (nthreads > 1) {
+      data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid);
+    }
+#endif
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles
+------------------------------------------------------------------------- */
+
+void PPPMCGOMP::fieldforce()
+{
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
+  const double * const q = atom->q;
+  const double * const * const x = atom->x;
+  const int nthreads = comm->nthreads;
+  const double qqrd2e = force->qqrd2e;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    // each thread works on a fixed chunk of atoms.
+    const int tid = omp_get_thread_num();
+    const int inum = num_charged;
+    const int idelta = 1 + inum/nthreads;
+    const int ifrom = tid*idelta;
+    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
+#else
+    const int ifrom = 0;
+    const int ito = num_charged;
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    double * const * const f = thr->get_f();
+    FFT_SCALAR * const * const r1d =  static_cast<FFT_SCALAR **>(thr->get_rho1d());
+
+    int i,j,l,m,n,nx,ny,nz,mx,my,mz;
+    FFT_SCALAR dx,dy,dz,x0,y0,z0;
+    FFT_SCALAR ekx,eky,ekz;
+
+    // this if protects against having more threads than charged local atoms
+    if (ifrom < num_charged) {
+      for (j = ifrom; j < ito; j++) {
+        i = is_charged[j];
+
+        nx = part2grid[i][0];
+        ny = part2grid[i][1];
+        nz = part2grid[i][2];
+        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+        compute_rho1d_thr(r1d,dx,dy,dz);
+
+        ekx = eky = ekz = ZEROF;
+        for (n = nlower; n <= nupper; n++) {
+          mz = n+nz;
+          z0 = r1d[2][n];
+          for (m = nlower; m <= nupper; m++) {
+            my = m+ny;
+            y0 = z0*r1d[1][m];
+            for (l = nlower; l <= nupper; l++) {
+              mx = l+nx;
+              x0 = y0*r1d[0][l];
+              ekx -= x0*vdx_brick[mz][my][mx];
+              eky -= x0*vdy_brick[mz][my][mx];
+              ekz -= x0*vdz_brick[mz][my][mx];
+            }
+          }
+        }
+
+        // convert E-field to force
+        const double qfactor = qqrd2e*scale*q[i];
+        f[i][0] += qfactor*ekx;
+        f[i][1] += qfactor*eky;
+        f[i][2] += qfactor*ekz;
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+ interpolate from grid to get per-atom energy/virial
+ ------------------------------------------------------------------------- */
+
+void PPPMCGOMP::fieldforce_peratom()
+{
+  // loop over my charges, interpolate from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  const double * const q = atom->q;
+  const double * const * const x = atom->x;
+  const int nthreads = comm->nthreads;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    // each thread works on a fixed chunk of atoms.
+    const int tid = omp_get_thread_num();
+    const int inum = num_charged;
+    const int idelta = 1 + inum/nthreads;
+    const int ifrom = tid*idelta;
+    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
+#else
+    const int ifrom = 0;
+    const int ito = num_charged;
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    FFT_SCALAR * const * const r1d =  static_cast<FFT_SCALAR **>(thr->get_rho1d());
+
+    int i,j,l,m,n,nx,ny,nz,mx,my,mz;
+    FFT_SCALAR dx,dy,dz,x0,y0,z0;
+    FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
+
+    // this if protects against having more threads than charged local atoms
+    if (ifrom < num_charged) {
+      for (j = ifrom; j < ito; j++) {
+        i = is_charged[j];
+
+        nx = part2grid[i][0];
+        ny = part2grid[i][1];
+        nz = part2grid[i][2];
+        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+        compute_rho1d_thr(r1d,dx,dy,dz);
+
+        u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
+        for (n = nlower; n <= nupper; n++) {
+          mz = n+nz;
+          z0 = r1d[2][n];
+          for (m = nlower; m <= nupper; m++) {
+            my = m+ny;
+            y0 = z0*r1d[1][m];
+            for (l = nlower; l <= nupper; l++) {
+              mx = l+nx;
+              x0 = y0*r1d[0][l];
+              if (eflag_atom) u += x0*u_brick[mz][my][mx];
+              if (vflag_atom) {
+                v0 += x0*v0_brick[mz][my][mx];
+                v1 += x0*v1_brick[mz][my][mx];
+                v2 += x0*v2_brick[mz][my][mx];
+                v3 += x0*v3_brick[mz][my][mx];
+                v4 += x0*v4_brick[mz][my][mx];
+                v5 += x0*v5_brick[mz][my][mx];
+              }
+            }
+          }
+        }
+
+        if (eflag_atom) eatom[i] += q[i]*u;
+        if (vflag_atom) {
+          vatom[i][0] += v0;
+          vatom[i][1] += v1;
+          vatom[i][2] += v2;
+          vatom[i][3] += v3;
+          vatom[i][4] += v4;
+          vatom[i][5] += v5;
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   charge assignment into rho1d
+   dx,dy,dz = distance of particle from "lower left" grid point
+------------------------------------------------------------------------- */
+void PPPMCGOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx,
+                                  const FFT_SCALAR &dy, const FFT_SCALAR &dz)
+{
+  int k,l;
+  FFT_SCALAR r1,r2,r3;
+
+  for (k = (1-order)/2; k <= order/2; k++) {
+    r1 = r2 = r3 = ZEROF;
+
+    for (l = order-1; l >= 0; l--) {
+      r1 = rho_coeff[l][k] + r1*dx;
+      r2 = rho_coeff[l][k] + r2*dy;
+      r3 = rho_coeff[l][k] + r3*dz;
+    }
+    r1d[0][k] = r1;
+    r1d[1][k] = r2;
+    r1d[2][k] = r3;
+  }
+}
diff --git a/src/USER-OMP/pppm_cg_omp.h b/src/USER-OMP/pppm_cg_omp.h
new file mode 100644
index 0000000000..10a9d35f29
--- /dev/null
+++ b/src/USER-OMP/pppm_cg_omp.h
@@ -0,0 +1,52 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/cg/omp,PPPMCGOMP)
+
+#else
+
+#ifndef LMP_PPPM_CG_OMP_H
+#define LMP_PPPM_CG_OMP_H
+
+#include "pppm_cg.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+  class PPPMCGOMP : public PPPMCG, public ThrOMP {
+ public:
+  PPPMCGOMP(class LAMMPS *, int, char **);
+  virtual void compute(int, int);
+  virtual void setup();
+  virtual ~PPPMCGOMP () {};
+
+ protected:
+  virtual void allocate();
+  virtual void deallocate();
+  virtual void fieldforce();
+  virtual void fieldforce_peratom();
+  virtual void make_rho();
+
+  void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
+                         const FFT_SCALAR &, const FFT_SCALAR &);
+//  void compute_rho_coeff();
+//  void slabcorr(int);
+
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pppm_omp.cpp b/src/USER-OMP/pppm_omp.cpp
new file mode 100644
index 0000000000..5c400b6f48
--- /dev/null
+++ b/src/USER-OMP/pppm_omp.cpp
@@ -0,0 +1,611 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "pppm_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "force.h"
+#include "memory.h"
+#include "math_const.h"
+
+#include <string.h>
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#ifdef FFT_SINGLE
+#define ZEROF 0.0f
+#define ONEF  1.0f
+#else
+#define ZEROF 0.0
+#define ONEF  1.0
+#endif
+
+#define EPS_HOC 1.0e-7
+
+/* ---------------------------------------------------------------------- */
+
+PPPMOMP::PPPMOMP(LAMMPS *lmp, int narg, char **arg) :
+  PPPM(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ----------------------------------------------------------------------
+   allocate memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+void PPPMOMP::allocate()
+{
+  PPPM::allocate();
+
+  const int nthreads = comm->nthreads;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+
+    FFT_SCALAR **rho1d_thr;
+    memory->create2d_offset(rho1d_thr,3,-order/2,order/2,"pppm:rho1d_thr");
+    ThrData *thr = fix->get_thr(tid);
+    thr->init_pppm(static_cast<void *>(rho1d_thr));
+  }
+
+  const int nzend = (nzhi_out-nzlo_out+1)*nthreads + nzlo_out -1;
+
+  // reallocate density brick, so it fits our needs
+  memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
+  memory->create3d_offset(density_brick,nzlo_out,nzend,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm:density_brick");
+}
+
+// NOTE: special version of reduce_data for FFT_SCALAR data type.
+// reduce per thread data into the first part of the data
+// array that is used for the non-threaded parts and reset
+// the temporary storage to 0.0. this routine depends on
+// multi-dimensional arrays like force stored in this order
+// x1,y1,z1,x2,y2,z2,...
+// we need to post a barrier to wait until all threads are done
+// writing to the array.
+static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid)
+{
+#if defined(_OPENMP)
+  // NOOP in non-threaded execution.
+  if (nthreads == 1) return;
+#pragma omp barrier
+  {
+    const int nvals = ndim*nall;
+    const int idelta = nvals/nthreads + 1;
+    const int ifrom = tid*idelta;
+    const int ito   = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta);
+
+    // this if protects against having more threads than atoms
+    if (ifrom < nall) {
+      for (int m = ifrom; m < ito; ++m) {
+        for (int n = 1; n < nthreads; ++n) {
+          dall[m] += dall[n*nvals + m];
+          dall[n*nvals + m] = 0.0;
+        }
+      }
+    }
+  }
+#else
+  // NOOP in non-threaded execution.
+  return;
+#endif
+}
+
+/* ----------------------------------------------------------------------
+   free memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+void PPPMOMP::deallocate()
+{
+  PPPM::deallocate();
+  for (int i=0; i < comm->nthreads; ++i) {
+    ThrData * thr = fix->get_thr(i);
+    FFT_SCALAR ** rho1d_thr = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+    memory->destroy2d_offset(rho1d_thr,-order/2);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   adjust PPPM coeffs, called initially and whenever volume has changed
+------------------------------------------------------------------------- */
+
+void PPPMOMP::setup()
+{
+  int i,j,k,n;
+  double *prd;
+
+  // volume-dependent factors
+  // adjust z dimension for 2d slab PPPM
+  // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0
+
+  if (triclinic == 0) prd = domain->prd;
+  else prd = domain->prd_lamda;
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  volume = xprd * yprd * zprd_slab;
+
+  delxinv = nx_pppm/xprd;
+  delyinv = ny_pppm/yprd;
+  delzinv = nz_pppm/zprd_slab;
+
+  delvolinv = delxinv*delyinv*delzinv;
+
+  const double unitkx = (2.0*MY_PI/xprd);
+  const double unitky = (2.0*MY_PI/yprd);
+  const double unitkz = (2.0*MY_PI/zprd_slab);
+
+  // fkx,fky,fkz for my FFT grid pts
+
+  double per;
+
+  for (i = nxlo_fft; i <= nxhi_fft; i++) {
+    per = i - nx_pppm*(2*i/nx_pppm);
+    fkx[i] = unitkx*per;
+  }
+
+  for (i = nylo_fft; i <= nyhi_fft; i++) {
+    per = i - ny_pppm*(2*i/ny_pppm);
+    fky[i] = unitky*per;
+  }
+
+  for (i = nzlo_fft; i <= nzhi_fft; i++) {
+    per = i - nz_pppm*(2*i/nz_pppm);
+    fkz[i] = unitkz*per;
+  }
+
+  // virial coefficients
+
+  double sqk,vterm;
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++) {
+    for (j = nylo_fft; j <= nyhi_fft; j++) {
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k];
+        if (sqk == 0.0) {
+          vg[n][0] = 0.0;
+          vg[n][1] = 0.0;
+          vg[n][2] = 0.0;
+          vg[n][3] = 0.0;
+          vg[n][4] = 0.0;
+          vg[n][5] = 0.0;
+        } else {
+          vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
+          vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i];
+          vg[n][1] = 1.0 + vterm*fky[j]*fky[j];
+          vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k];
+          vg[n][3] = vterm*fkx[i]*fky[j];
+          vg[n][4] = vterm*fkx[i]*fkz[k];
+          vg[n][5] = vterm*fky[j]*fkz[k];
+        }
+        n++;
+      }
+    }
+  }
+
+  // modified (Hockney-Eastwood) Coulomb Green's function
+
+  const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
+                                    pow(-log(EPS_HOC),0.25));
+  const int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
+                                    pow(-log(EPS_HOC),0.25));
+  const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
+                                    pow(-log(EPS_HOC),0.25));
+
+  const double form = 1.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    int tid,nn,nnfrom,nnto,nx,ny,nz,k,l,m;
+    double snx,sny,snz,sqk;
+    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+    double sum1,dot1,dot2;
+    double numerator,denominator;
+    const double gew2 = -4.0*g_ewald*g_ewald;
+
+    const int nnx = nxhi_fft-nxlo_fft+1;
+    const int nny = nyhi_fft-nylo_fft+1;
+
+    loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads);
+
+    for (m = nzlo_fft; m <= nzhi_fft; m++) {
+
+      const double fkzm = fkz[m];
+      snz = sin(0.5*fkzm*zprd_slab/nz_pppm);
+      snz *= snz;
+
+      for (l = nylo_fft; l <= nyhi_fft; l++) {
+        const double fkyl = fky[l];
+        sny = sin(0.5*fkyl*yprd/ny_pppm);
+        sny *= sny;
+
+        for (k = nxlo_fft; k <= nxhi_fft; k++) {
+
+          /* only compute the part designated to this thread */
+          nn = k-nxlo_fft + nnx*(l-nylo_fft + nny*(m-nzlo_fft));
+          if ((nn < nnfrom) || (nn >=nnto)) continue;
+
+          const double fkxk = fkx[k];
+          snx = sin(0.5*fkxk*xprd/nx_pppm);
+          snx *= snx;
+
+          sqk = fkxk*fkxk + fkyl*fkyl + fkzm*fkzm;
+
+          if (sqk != 0.0) {
+            numerator = form*MY_4PI/sqk;
+            denominator = gf_denom(snx,sny,snz);
+            sum1 = 0.0;
+            const double dorder = static_cast<double>(order);
+            for (nx = -nbx; nx <= nbx; nx++) {
+              qx = fkxk + unitkx*nx_pppm*nx;
+              sx = exp(qx*qx/gew2);
+              wx = 1.0;
+              argx = 0.5*qx*xprd/nx_pppm;
+              if (argx != 0.0) wx = pow(sin(argx)/argx,dorder);
+              wx *=wx;
+
+              for (ny = -nby; ny <= nby; ny++) {
+                qy = fkyl + unitky*ny_pppm*ny;
+                sy = exp(qy*qy/gew2);
+                wy = 1.0;
+                argy = 0.5*qy*yprd/ny_pppm;
+                if (argy != 0.0) wy = pow(sin(argy)/argy,dorder);
+                wy *= wy;
+
+                for (nz = -nbz; nz <= nbz; nz++) {
+                  qz = fkzm + unitkz*nz_pppm*nz;
+                  sz = exp(qz*qz/gew2);
+                  wz = 1.0;
+                  argz = 0.5*qz*zprd_slab/nz_pppm;
+                  if (argz != 0.0) wz = pow(sin(argz)/argz,dorder);
+                  wz *= wz;
+
+                  dot1 = fkxk*qx + fkyl*qy + fkzm*qz;
+                  dot2 = qx*qx+qy*qy+qz*qz;
+                  sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
+                }
+              }
+            }
+            greensfn[nn] = numerator*sum1/denominator;
+          } else greensfn[nn] = 0.0;
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   run the regular toplevel compute method from plain PPPPM
+   which will have individual methods replaced by our threaded
+   versions and then call the obligatory force reduction.
+------------------------------------------------------------------------- */
+
+void PPPMOMP::compute(int eflag, int vflag)
+{
+
+  PPPM::compute(eflag,vflag);
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = charge "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid
+------------------------------------------------------------------------- */
+
+void PPPMOMP::make_rho()
+{
+  const double * const q = atom->q;
+  const double * const * const x = atom->x;
+  const int nthreads = comm->nthreads;
+  const int nlocal = atom->nlocal;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    // each thread works on a fixed chunk of atoms.
+    const int tid = omp_get_thread_num();
+    const int inum = nlocal;
+    const int idelta = 1 + inum/nthreads;
+    const int ifrom = tid*idelta;
+    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
+#else
+    const int tid = 0;
+    const int ifrom = 0;
+    const int ito = nlocal;
+#endif
+
+    int l,m,n,nx,ny,nz,mx,my,mz;
+    FFT_SCALAR dx,dy,dz,x0,y0,z0;
+
+    // set up clear 3d density array
+    const int nzoffs = (nzhi_out-nzlo_out+1)*tid;
+    FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]);
+    memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR));
+
+    ThrData *thr = fix->get_thr(tid);
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+
+    // loop over my charges, add their contribution to nearby grid points
+    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+    // (dx,dy,dz) = distance to "lower left" grid pt
+    // (mx,my,mz) = global coords of moving stencil pt
+
+    // this if protects against having more threads than local atoms
+    if (ifrom < nlocal) {
+      for (int i = ifrom; i < ito; i++) {
+
+        nx = part2grid[i][0];
+        ny = part2grid[i][1];
+        nz = part2grid[i][2];
+        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+        compute_rho1d_thr(r1d,dx,dy,dz);
+
+        z0 = delvolinv * q[i];
+        for (n = nlower; n <= nupper; n++) {
+          mz = n+nz;
+          y0 = z0*r1d[2][n];
+          for (m = nlower; m <= nupper; m++) {
+            my = m+ny;
+            x0 = y0*r1d[1][m];
+            for (l = nlower; l <= nupper; l++) {
+              mx = l+nx;
+              db[mz][my][mx] += x0*r1d[0][l];
+            }
+          }
+        }
+      }
+    }
+#if defined(_OPENMP)
+    // reduce 3d density array
+    if (nthreads > 1) {
+      data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid);
+    }
+#endif
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles
+------------------------------------------------------------------------- */
+
+void PPPMOMP::fieldforce()
+{
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
+  const double * const q = atom->q;
+  const double * const * const x = atom->x;
+  const int nthreads = comm->nthreads;
+  const int nlocal = atom->nlocal;
+  const double qqrd2e = force->qqrd2e;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    // each thread works on a fixed chunk of atoms.
+    const int tid = omp_get_thread_num();
+    const int inum = nlocal;
+    const int idelta = 1 + inum/nthreads;
+    const int ifrom = tid*idelta;
+    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
+#else
+    const int ifrom = 0;
+    const int ito = nlocal;
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    double * const * const f = thr->get_f();
+    FFT_SCALAR * const * const r1d =  static_cast<FFT_SCALAR **>(thr->get_rho1d());
+
+    int l,m,n,nx,ny,nz,mx,my,mz;
+    FFT_SCALAR dx,dy,dz,x0,y0,z0;
+    FFT_SCALAR ekx,eky,ekz;
+
+    // this if protects against having more threads than local atoms
+    if (ifrom < nlocal) {
+      for (int i = ifrom; i < ito; i++) {
+
+        nx = part2grid[i][0];
+        ny = part2grid[i][1];
+        nz = part2grid[i][2];
+        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+        compute_rho1d_thr(r1d,dx,dy,dz);
+
+        ekx = eky = ekz = ZEROF;
+        for (n = nlower; n <= nupper; n++) {
+          mz = n+nz;
+          z0 = r1d[2][n];
+          for (m = nlower; m <= nupper; m++) {
+            my = m+ny;
+            y0 = z0*r1d[1][m];
+            for (l = nlower; l <= nupper; l++) {
+              mx = l+nx;
+              x0 = y0*r1d[0][l];
+              ekx -= x0*vdx_brick[mz][my][mx];
+              eky -= x0*vdy_brick[mz][my][mx];
+              ekz -= x0*vdz_brick[mz][my][mx];
+            }
+          }
+        }
+
+        // convert E-field to force
+        const double qfactor = qqrd2e*scale*q[i];
+        f[i][0] += qfactor*ekx;
+        f[i][1] += qfactor*eky;
+        f[i][2] += qfactor*ekz;
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+ interpolate from grid to get per-atom energy/virial
+ ------------------------------------------------------------------------- */
+
+void PPPMOMP::fieldforce_peratom()
+{
+  // loop over my charges, interpolate from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  const double * const q = atom->q;
+  const double * const * const x = atom->x;
+  const int nthreads = comm->nthreads;
+  const int nlocal = atom->nlocal;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    // each thread works on a fixed chunk of atoms.
+    const int tid = omp_get_thread_num();
+    const int inum = nlocal;
+    const int idelta = 1 + inum/nthreads;
+    const int ifrom = tid*idelta;
+    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
+#else
+    const int ifrom = 0;
+    const int ito = nlocal;
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    FFT_SCALAR * const * const r1d =  static_cast<FFT_SCALAR **>(thr->get_rho1d());
+
+    int i,l,m,n,nx,ny,nz,mx,my,mz;
+    FFT_SCALAR dx,dy,dz,x0,y0,z0;
+    FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
+
+    // this if protects against having more threads than local atoms
+    if (ifrom < nlocal) {
+      for (int i = ifrom; i < ito; i++) {
+
+        nx = part2grid[i][0];
+        ny = part2grid[i][1];
+        nz = part2grid[i][2];
+        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+        compute_rho1d_thr(r1d,dx,dy,dz);
+
+        u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
+        for (n = nlower; n <= nupper; n++) {
+          mz = n+nz;
+          z0 = r1d[2][n];
+          for (m = nlower; m <= nupper; m++) {
+            my = m+ny;
+            y0 = z0*r1d[1][m];
+            for (l = nlower; l <= nupper; l++) {
+              mx = l+nx;
+              x0 = y0*r1d[0][l];
+              if (eflag_atom) u += x0*u_brick[mz][my][mx];
+              if (vflag_atom) {
+                v0 += x0*v0_brick[mz][my][mx];
+                v1 += x0*v1_brick[mz][my][mx];
+                v2 += x0*v2_brick[mz][my][mx];
+                v3 += x0*v3_brick[mz][my][mx];
+                v4 += x0*v4_brick[mz][my][mx];
+                v5 += x0*v5_brick[mz][my][mx];
+              }
+            }
+          }
+        }
+
+        if (eflag_atom) eatom[i] += q[i]*u;
+        if (vflag_atom) {
+          vatom[i][0] += v0;
+          vatom[i][1] += v1;
+          vatom[i][2] += v2;
+          vatom[i][3] += v3;
+          vatom[i][4] += v4;
+          vatom[i][5] += v5;
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   charge assignment into rho1d
+   dx,dy,dz = distance of particle from "lower left" grid point
+------------------------------------------------------------------------- */
+void PPPMOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx,
+                                const FFT_SCALAR &dy, const FFT_SCALAR &dz)
+{
+  int k,l;
+  FFT_SCALAR r1,r2,r3;
+
+  for (k = (1-order)/2; k <= order/2; k++) {
+    r1 = r2 = r3 = ZEROF;
+
+    for (l = order-1; l >= 0; l--) {
+      r1 = rho_coeff[l][k] + r1*dx;
+      r2 = rho_coeff[l][k] + r2*dy;
+      r3 = rho_coeff[l][k] + r3*dz;
+    }
+    r1d[0][k] = r1;
+    r1d[1][k] = r2;
+    r1d[2][k] = r3;
+  }
+}
diff --git a/src/USER-OMP/pppm_omp.h b/src/USER-OMP/pppm_omp.h
new file mode 100644
index 0000000000..b3c069844c
--- /dev/null
+++ b/src/USER-OMP/pppm_omp.h
@@ -0,0 +1,52 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/omp,PPPMOMP)
+
+#else
+
+#ifndef LMP_PPPM_OMP_H
+#define LMP_PPPM_OMP_H
+
+#include "pppm.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+  class PPPMOMP : public PPPM, public ThrOMP {
+ public:
+  PPPMOMP(class LAMMPS *, int, char **);
+  virtual ~PPPMOMP () {};
+  virtual void setup();
+  virtual void compute(int, int);
+
+ protected:
+  virtual void allocate();
+  virtual void deallocate();
+  virtual void fieldforce();
+  virtual void fieldforce_peratom();
+  virtual void make_rho();
+
+  void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
+                         const FFT_SCALAR &, const FFT_SCALAR &);
+//  void compute_rho_coeff();
+//  void slabcorr(int);
+
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pppm_proxy.cpp b/src/USER-OMP/pppm_proxy.cpp
new file mode 100644
index 0000000000..25c2158a08
--- /dev/null
+++ b/src/USER-OMP/pppm_proxy.cpp
@@ -0,0 +1,64 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "pppm_proxy.h"
+
+#include <stdio.h>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PPPMProxy::PPPMProxy(LAMMPS *lmp, int narg, char **arg) :
+  PPPM(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE|THR_PROXY) { need_setup=1; }
+
+/* ---------------------------------------------------------------------- */
+
+void PPPMProxy::setup_proxy()
+{
+  if (need_setup)
+    PPPM::setup();
+
+  need_setup = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PPPMProxy::compute(int eflag, int vflag)
+{
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+
+    ThrData *thr = fix->get_thr(tid);
+    reduce_thr(this, eflag,vflag,thr);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PPPMProxy::compute_proxy(int eflag, int vflag)
+{
+  setup_proxy();
+  PPPM::compute(eflag,vflag);
+}
diff --git a/src/USER-OMP/pppm_proxy.h b/src/USER-OMP/pppm_proxy.h
new file mode 100644
index 0000000000..110abfc9da
--- /dev/null
+++ b/src/USER-OMP/pppm_proxy.h
@@ -0,0 +1,47 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/proxy,PPPMProxy)
+
+#else
+
+#ifndef LMP_PPPM_PROXY_H
+#define LMP_PPPM_PROXY_H
+
+#include "pppm.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+  class PPPMProxy : public PPPM, public ThrOMP {
+ public:
+  PPPMProxy(class LAMMPS *, int, char **);
+  virtual ~PPPMProxy () {};
+
+  virtual void compute(int, int);
+  virtual void compute_proxy(int eflag, int vflag);
+
+  // setup is delayed until the compute proxy is called
+  virtual void setup() { need_setup=1; };
+  virtual void setup_proxy();
+
+ protected:
+  int need_setup;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/pppm_tip4p_omp.cpp b/src/USER-OMP/pppm_tip4p_omp.cpp
new file mode 100644
index 0000000000..e59275239b
--- /dev/null
+++ b/src/USER-OMP/pppm_tip4p_omp.cpp
@@ -0,0 +1,402 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "pppm_tip4p_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "force.h"
+#include "memory.h"
+#include "math_const.h"
+
+#include <string.h>
+#include <math.h>
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define OFFSET 16384
+
+#ifdef FFT_SINGLE
+#define ZEROF 0.0f
+#define ONEF  1.0f
+#else
+#define ZEROF 0.0
+#define ONEF  1.0
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+PPPMTIP4POMP::PPPMTIP4POMP(LAMMPS *lmp, int narg, char **arg) :
+  PPPMOMP(lmp, narg, arg)
+{
+  suffix_flag |= Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// NOTE: special version of reduce_data for FFT_SCALAR data type.
+// reduce per thread data into the first part of the data
+// array that is used for the non-threaded parts and reset
+// the temporary storage to 0.0. this routine depends on
+// multi-dimensional arrays like force stored in this order
+// x1,y1,z1,x2,y2,z2,...
+// we need to post a barrier to wait until all threads are done
+// writing to the array.
+static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid)
+{
+#if defined(_OPENMP)
+  // NOOP in non-threaded execution.
+  if (nthreads == 1) return;
+#pragma omp barrier
+  {
+    const int nvals = ndim*nall;
+    const int idelta = nvals/nthreads + 1;
+    const int ifrom = tid*idelta;
+    const int ito   = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta);
+
+    // this if protects against having more threads than atoms
+    if (ifrom < nall) {
+      for (int m = ifrom; m < ito; ++m) {
+        for (int n = 1; n < nthreads; ++n) {
+          dall[m] += dall[n*nvals + m];
+          dall[n*nvals + m] = 0.0;
+        }
+      }
+    }
+  }
+#else
+  // NOOP in non-threaded execution.
+  return;
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::init()
+{
+  // TIP4P PPPM requires newton on, b/c it computes forces on ghost atoms
+
+  if (force->newton == 0)
+    error->all(FLERR,"Kspace style pppm/tip4p/omp requires newton on");
+
+  PPPMOMP::init();
+}
+
+/* ----------------------------------------------------------------------
+   find center grid pt for each of my particles
+   check that full stencil for the particle will fit in my 3d brick
+   store central grid pt indices in part2grid array
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::particle_map()
+{
+  const int * const type = atom->type;
+  const double * const * const x = atom->x;
+  const int nlocal = atom->nlocal;
+
+  int i, flag = 0;
+#if defined(_OPENMP)
+#pragma omp parallel for private(i) default(none) reduction(+:flag) schedule(static)
+#endif
+  for (i = 0; i < nlocal; i++) {
+    int nx,ny,nz,iH1,iH2;
+    double xM[3];
+
+    if (type[i] == typeO) {
+      find_M(i,iH1,iH2,xM);
+    } else {
+      xM[0] = x[i][0];
+      xM[1] = x[i][1];
+      xM[2] = x[i][2];
+    }
+
+    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+    // current particle coord can be outside global and local box
+    // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
+
+    nx = static_cast<int> ((xM[0]-boxlo[0])*delxinv+shift) - OFFSET;
+    ny = static_cast<int> ((xM[1]-boxlo[1])*delyinv+shift) - OFFSET;
+    nz = static_cast<int> ((xM[2]-boxlo[2])*delzinv+shift) - OFFSET;
+
+    part2grid[i][0] = nx;
+    part2grid[i][1] = ny;
+    part2grid[i][2] = nz;
+
+    // check that entire stencil around nx,ny,nz will fit in my 3d brick
+
+    if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
+        ny+nlower < nylo_out || ny+nupper > nyhi_out ||
+        nz+nlower < nzlo_out || nz+nupper > nzhi_out) flag++;
+  }
+
+  int flag_all;
+  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
+  if (flag_all) error->all(FLERR,"Out of range atoms - cannot compute PPPM");
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = charge "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::make_rho()
+{
+  const double * const q = atom->q;
+  const double * const * const x = atom->x;
+  const int * const type = atom->type;
+  const int nthreads = comm->nthreads;
+  const int nlocal = atom->nlocal;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    // each thread works on a fixed chunk of atoms.
+    const int tid = omp_get_thread_num();
+    const int inum = nlocal;
+    const int idelta = 1 + inum/nthreads;
+    const int ifrom = tid*idelta;
+    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
+#else
+    const int tid = 0;
+    const int ifrom = 0;
+    const int ito = nlocal;
+#endif
+
+    int l,m,n,nx,ny,nz,mx,my,mz,iH1,iH2;
+    FFT_SCALAR dx,dy,dz,x0,y0,z0;
+    double xM[3];
+
+    // set up clear 3d density array
+    const int nzoffs = (nzhi_out-nzlo_out+1)*tid;
+    FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]);
+    memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR));
+
+    ThrData *thr = fix->get_thr(tid);
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+
+    // loop over my charges, add their contribution to nearby grid points
+    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+    // (dx,dy,dz) = distance to "lower left" grid pt
+    // (mx,my,mz) = global coords of moving stencil pt
+
+    // this if protects against having more threads than local atoms
+    if (ifrom < nlocal) {
+      for (int i = ifrom; i < ito; i++) {
+
+        if (type[i] == typeO) {
+          find_M(i,iH1,iH2,xM);
+        } else {
+          xM[0] = x[i][0];
+          xM[1] = x[i][1];
+          xM[2] = x[i][2];
+        }
+
+        nx = part2grid[i][0];
+        ny = part2grid[i][1];
+        nz = part2grid[i][2];
+        dx = nx+shiftone - (xM[0]-boxlo[0])*delxinv;
+        dy = ny+shiftone - (xM[1]-boxlo[1])*delyinv;
+        dz = nz+shiftone - (xM[2]-boxlo[2])*delzinv;
+
+        compute_rho1d_thr(r1d,dx,dy,dz);
+
+        z0 = delvolinv * q[i];
+        for (n = nlower; n <= nupper; n++) {
+          mz = n+nz;
+          y0 = z0*r1d[2][n];
+          for (m = nlower; m <= nupper; m++) {
+            my = m+ny;
+            x0 = y0*r1d[1][m];
+            for (l = nlower; l <= nupper; l++) {
+              mx = l+nx;
+              db[mz][my][mx] += x0*r1d[0][l];
+            }
+          }
+        }
+      }
+    }
+#if defined(_OPENMP)
+    // reduce 3d density array
+    if (nthreads > 1) {
+      data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid);
+    }
+#endif
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::fieldforce()
+{
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
+  const double * const q = atom->q;
+  const double * const * const x = atom->x;
+  const int * const type = atom->type;
+  const int nthreads = comm->nthreads;
+  const int nlocal = atom->nlocal;
+  const double qqrd2e = force->qqrd2e;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    // each thread works on a fixed chunk of atoms.
+    const int tid = omp_get_thread_num();
+    const int inum = nlocal;
+    const int idelta = 1 + inum/nthreads;
+    const int ifrom = tid*idelta;
+    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
+#else
+    const int ifrom = 0;
+    const int ito = nlocal;
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    double * const * const f = thr->get_f();
+    FFT_SCALAR * const * const r1d =  static_cast<FFT_SCALAR **>(thr->get_rho1d());
+
+    int l,m,n,nx,ny,nz,mx,my,mz;
+    FFT_SCALAR dx,dy,dz,x0,y0,z0;
+    FFT_SCALAR ekx,eky,ekz;
+    int iH1,iH2;
+    double xM[3], fx,fy,fz;
+    double ddotf, rOMx, rOMy, rOMz, f1x, f1y, f1z;
+
+    // this if protects against having more threads than local atoms
+    if (ifrom < nlocal) {
+      for (int i = ifrom; i < ito; i++) {
+
+        if (type[i] == typeO) {
+          find_M(i,iH1,iH2,xM);
+        } else {
+          xM[0] = x[i][0];
+          xM[1] = x[i][1];
+          xM[2] = x[i][2];
+        }
+
+        nx = part2grid[i][0];
+        ny = part2grid[i][1];
+        nz = part2grid[i][2];
+        dx = nx+shiftone - (xM[0]-boxlo[0])*delxinv;
+        dy = ny+shiftone - (xM[1]-boxlo[1])*delyinv;
+        dz = nz+shiftone - (xM[2]-boxlo[2])*delzinv;
+
+        compute_rho1d_thr(r1d,dx,dy,dz);
+
+        ekx = eky = ekz = ZEROF;
+        for (n = nlower; n <= nupper; n++) {
+          mz = n+nz;
+          z0 = r1d[2][n];
+          for (m = nlower; m <= nupper; m++) {
+            my = m+ny;
+            y0 = z0*r1d[1][m];
+            for (l = nlower; l <= nupper; l++) {
+              mx = l+nx;
+              x0 = y0*r1d[0][l];
+              ekx -= x0*vdx_brick[mz][my][mx];
+              eky -= x0*vdy_brick[mz][my][mx];
+              ekz -= x0*vdz_brick[mz][my][mx];
+            }
+          }
+        }
+
+        // convert E-field to force
+        const double qfactor = qqrd2e*scale*q[i];
+
+        if (type[i] != typeO) {
+          f[i][0] += qfactor*ekx;
+          f[i][1] += qfactor*eky;
+          f[i][2] += qfactor*ekz;
+
+        } else {
+          fx = qfactor * ekx;
+          fy = qfactor * eky;
+          fz = qfactor * ekz;
+          find_M(i,iH1,iH2,xM);
+
+          rOMx = xM[0] - x[i][0];
+          rOMy = xM[1] - x[i][1];
+          rOMz = xM[2] - x[i][2];
+
+          ddotf = (rOMx * fx + rOMy * fy + rOMz * fz) / (qdist * qdist);
+
+          f1x = ddotf * rOMx;
+          f1y = ddotf * rOMy;
+          f1z = ddotf * rOMz;
+
+          f[i][0] += fx - alpha * (fx - f1x);
+          f[i][1] += fy - alpha * (fy - f1y);
+          f[i][2] += fz - alpha * (fz - f1z);
+
+          f[iH1][0] += 0.5*alpha*(fx - f1x);
+          f[iH1][1] += 0.5*alpha*(fy - f1y);
+          f[iH1][2] += 0.5*alpha*(fz - f1z);
+
+          f[iH2][0] += 0.5*alpha*(fx - f1x);
+          f[iH2][1] += 0.5*alpha*(fy - f1y);
+          f[iH2][2] += 0.5*alpha*(fz - f1z);
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+  find 2 H atoms bonded to O atom i
+  compute position xM of fictitious charge site for O atom
+  also return local indices iH1,iH2 of H atoms
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::find_M(int i, int &iH1, int &iH2, double *xM)
+{
+  iH1 = atom->map(atom->tag[i] + 1);
+  iH2 = atom->map(atom->tag[i] + 2);
+
+  if (iH1 == -1 || iH2 == -1) error->one(FLERR,"TIP4P hydrogen is missing");
+  if (atom->type[iH1] != typeH || atom->type[iH2] != typeH)
+    error->one(FLERR,"TIP4P hydrogen has incorrect atom type");
+
+  const double * const * const x = atom->x;
+
+  double delx1 = x[iH1][0] - x[i][0];
+  double dely1 = x[iH1][1] - x[i][1];
+  double delz1 = x[iH1][2] - x[i][2];
+  domain->minimum_image(delx1,dely1,delz1);
+
+  double delx2 = x[iH2][0] - x[i][0];
+  double dely2 = x[iH2][1] - x[i][1];
+  double delz2 = x[iH2][2] - x[i][2];
+  domain->minimum_image(delx2,dely2,delz2);
+
+  xM[0] = x[i][0] + alpha * 0.5 * (delx1 + delx2);
+  xM[1] = x[i][1] + alpha * 0.5 * (dely1 + dely2);
+  xM[2] = x[i][2] + alpha * 0.5 * (delz1 + delz2);
+}
diff --git a/src/USER-OMP/pppm_tip4p_omp.h b/src/USER-OMP/pppm_tip4p_omp.h
new file mode 100644
index 0000000000..11801ccb35
--- /dev/null
+++ b/src/USER-OMP/pppm_tip4p_omp.h
@@ -0,0 +1,56 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/tip4p/omp,PPPMTIP4POMP)
+
+#else
+
+#ifndef LMP_PPPM_TIP4P_OMP_H
+#define LMP_PPPM_TIP4P_OMP_H
+
+#include "pppm_omp.h"
+
+namespace LAMMPS_NS {
+
+class PPPMTIP4POMP : public PPPMOMP {
+ public:
+  PPPMTIP4POMP(class LAMMPS *, int, char **);
+  virtual ~PPPMTIP4POMP () {};
+  virtual void init();
+
+ protected:
+  virtual void particle_map();
+  virtual void fieldforce();
+  virtual void make_rho();
+
+ private:
+  void find_M(int, int &, int &, double *);
+
+//  void slabcorr(int);
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Kspace style pppm/tip4p/omp requires newton on
+
+Self-explanatory.
+
+*/
diff --git a/src/USER-OMP/pppm_tip4p_proxy.cpp b/src/USER-OMP/pppm_tip4p_proxy.cpp
new file mode 100644
index 0000000000..db851dec5b
--- /dev/null
+++ b/src/USER-OMP/pppm_tip4p_proxy.cpp
@@ -0,0 +1,64 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "pppm_tip4p_proxy.h"
+
+#include <stdio.h>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PPPMTIP4PProxy::PPPMTIP4PProxy(LAMMPS *lmp, int narg, char **arg) :
+  PPPMTIP4P(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE|THR_PROXY) { need_setup=1; }
+
+/* ---------------------------------------------------------------------- */
+
+void PPPMTIP4PProxy::setup_proxy()
+{
+  if (need_setup)
+    PPPMTIP4P::setup();
+
+  need_setup=0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PPPMTIP4PProxy::compute(int eflag, int vflag)
+{
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+
+    ThrData *thr = fix->get_thr(tid);
+    reduce_thr(this, eflag,vflag,thr);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PPPMTIP4PProxy::compute_proxy(int eflag, int vflag)
+{
+  setup_proxy();
+  PPPMTIP4P::compute(eflag,vflag);
+}
diff --git a/src/USER-OMP/pppm_tip4p_proxy.h b/src/USER-OMP/pppm_tip4p_proxy.h
new file mode 100644
index 0000000000..0a670d8b63
--- /dev/null
+++ b/src/USER-OMP/pppm_tip4p_proxy.h
@@ -0,0 +1,47 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/tip4p/proxy,PPPMTIP4PProxy)
+
+#else
+
+#ifndef LMP_PPPM_TIP4P_PROXY_H
+#define LMP_PPPM_TIP4P_PROXY_H
+
+#include "pppm_tip4p.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+  class PPPMTIP4PProxy : public PPPMTIP4P, public ThrOMP {
+ public:
+  PPPMTIP4PProxy(class LAMMPS *, int, char **);
+  virtual ~PPPMTIP4PProxy () {};
+
+  virtual void compute(int, int);
+  virtual void compute_proxy(int eflag, int vflag);
+
+  // setup is delayed until the compute_proxy is called
+  virtual void setup() { need_setup=1; };
+  virtual void setup_proxy();
+
+ protected:
+  int need_setup;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-OMP/thr_data.cpp b/src/USER-OMP/thr_data.cpp
new file mode 100644
index 0000000000..cebed27721
--- /dev/null
+++ b/src/USER-OMP/thr_data.cpp
@@ -0,0 +1,231 @@
+/* -------------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   per-thread data management for LAMMPS
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "thr_data.h"
+
+#include <string.h>
+#include <stdio.h>
+
+using namespace LAMMPS_NS;
+
+
+ThrData::ThrData(int tid)
+  : _f(NULL), _torque(NULL), _erforce(NULL), _de(NULL), _drho(NULL), _mu(NULL),
+    _lambda(NULL), _rhoB(NULL), _D_values(NULL), _rho(NULL), _fp(NULL),
+    _rho1d(NULL), _tid(tid)
+{
+  // nothing else to do here.
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+void ThrData::check_tid(int tid)
+{
+  if (tid != _tid)
+    fprintf(stderr,"WARNING: external and internal tid mismatch %d != %d\n",tid,_tid);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ThrData::init_force(int nall, double **f, double **torque,
+                         double *erforce, double *de, double *drho)
+{
+  eng_vdwl=eng_coul=eng_bond=eng_angle=eng_dihed=eng_imprp=eng_kspce=0.0;
+  memset(virial_pair,0,6*sizeof(double));
+  memset(virial_bond,0,6*sizeof(double));
+  memset(virial_angle,0,6*sizeof(double));
+  memset(virial_dihed,0,6*sizeof(double));
+  memset(virial_imprp,0,6*sizeof(double));
+  memset(virial_kspce,0,6*sizeof(double));
+
+  eatom_pair=eatom_bond=eatom_angle=eatom_dihed=eatom_imprp=eatom_kspce=NULL;
+  vatom_pair=vatom_bond=vatom_angle=vatom_dihed=vatom_imprp=vatom_kspce=NULL;
+
+  _f = f + _tid*nall;
+  if (nall > 0)
+    memset(&(_f[0][0]),0,nall*3*sizeof(double));
+
+  if (torque) {
+    _torque = torque + _tid*nall;
+    if (nall > 0)
+      memset(&(_torque[0][0]),0,nall*3*sizeof(double));
+  } else _torque = NULL;
+
+  if (erforce) {
+    _erforce = erforce + _tid*nall;
+    if (nall > 0)
+      memset(&(_erforce[0]),0,nall*sizeof(double));
+  } else _erforce = NULL;
+
+  if (de) {
+    _de = de + _tid*nall;
+    if (nall > 0)
+      memset(&(_de[0]),0,nall*sizeof(double));
+  } else _de = NULL;
+
+  if (drho) {
+    _drho = drho + _tid*nall;
+    if (nall > 0)
+      memset(&(_drho[0]),0,nall*sizeof(double));
+  } else _drho = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   set up and clear out locally managed per atom arrays
+------------------------------------------------------------------------- */
+
+void ThrData::init_eam(int nall, double *rho)
+{
+  _rho = rho + _tid*nall;
+  if (nall > 0)
+    memset(_rho, 0, nall*sizeof(double));
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ThrData::init_adp(int nall, double *rho, double **mu, double **lambda)
+{
+  init_eam(nall, rho);
+
+  _mu = mu + _tid*nall;
+  _lambda = lambda + _tid*nall;
+  if (nall > 0) {
+    memset(&(_mu[0][0]), 0, nall*3*sizeof(double));
+    memset(&(_lambda[0][0]), 0, nall*6*sizeof(double));
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ThrData::init_cdeam(int nall, double *rho, double *rhoB, double *D_values)
+{
+  init_eam(nall, rho);
+
+  _rhoB = rhoB + _tid*nall;
+  _D_values = D_values + _tid*nall;
+  if (nall > 0) {
+    memset(_rhoB, 0, nall*sizeof(double));
+    memset(_D_values, 0, nall*sizeof(double));
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ThrData::init_eim(int nall, double *rho, double *fp)
+{
+  init_eam(nall, rho);
+
+  _fp = fp + _tid*nall;
+  if (nall > 0)
+    memset(_fp,0,nall*sizeof(double));
+}
+
+/* ----------------------------------------------------------------------
+   compute global pair virial via summing F dot r over own & ghost atoms
+   at this point, only pairwise forces have been accumulated in atom->f
+------------------------------------------------------------------------- */
+
+void ThrData::virial_fdotr_compute(double **x, int nlocal, int nghost, int nfirst)
+{
+
+  // sum over force on all particles including ghosts
+
+  if (nfirst < 0) {
+    int nall = nlocal + nghost;
+    for (int i = 0; i < nall; i++) {
+      virial_pair[0] += _f[i][0]*x[i][0];
+      virial_pair[1] += _f[i][1]*x[i][1];
+      virial_pair[2] += _f[i][2]*x[i][2];
+      virial_pair[3] += _f[i][1]*x[i][0];
+      virial_pair[4] += _f[i][2]*x[i][0];
+      virial_pair[5] += _f[i][2]*x[i][1];
+    }
+
+  // neighbor includegroup flag is set
+  // sum over force on initial nfirst particles and ghosts
+
+  } else {
+    int nall = nfirst;
+    for (int i = 0; i < nall; i++) {
+      virial_pair[0] += _f[i][0]*x[i][0];
+      virial_pair[1] += _f[i][1]*x[i][1];
+      virial_pair[2] += _f[i][2]*x[i][2];
+      virial_pair[3] += _f[i][1]*x[i][0];
+      virial_pair[4] += _f[i][2]*x[i][0];
+      virial_pair[5] += _f[i][2]*x[i][1];
+    }
+    nall = nlocal + nghost;
+    for (int i = nlocal; i < nall; i++) {
+      virial_pair[0] += _f[i][0]*x[i][0];
+      virial_pair[1] += _f[i][1]*x[i][1];
+      virial_pair[2] += _f[i][2]*x[i][2];
+      virial_pair[3] += _f[i][1]*x[i][0];
+      virial_pair[4] += _f[i][2]*x[i][0];
+      virial_pair[5] += _f[i][2]*x[i][1];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double ThrData::memory_usage()
+{
+  double bytes = (7 + 6*6) * sizeof(double);
+  bytes += 2 * sizeof(double*);
+  bytes += 4 * sizeof(int);
+
+  return bytes;
+}
+
+/* additional helper functions */
+
+// reduce per thread data into the first part of the data
+// array that is used for the non-threaded parts and reset
+// the temporary storage to 0.0. this routine depends on
+// multi-dimensional arrays like force stored in this order
+// x1,y1,z1,x2,y2,z2,...
+// we need to post a barrier to wait until all threads are done
+// with writing to the array .
+void LAMMPS_NS::data_reduce_thr(double *dall, int nall, int nthreads, int ndim, int tid)
+{
+#if defined(_OPENMP)
+  // NOOP in non-threaded execution.
+  if (nthreads == 1) return;
+#pragma omp barrier
+  {
+    const int nvals = ndim*nall;
+    const int idelta = nvals/nthreads + 1;
+    const int ifrom = tid*idelta;
+    const int ito   = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta);
+
+    // this if protects against having more threads than atoms
+    if (ifrom < nvals) {
+      for (int m = ifrom; m < ito; ++m) {
+        for (int n = 1; n < nthreads; ++n) {
+          dall[m] += dall[n*nvals + m];
+          dall[n*nvals + m] = 0.0;
+        }
+      }
+    }
+  }
+#else
+  // NOOP in non-threaded execution.
+  return;
+#endif
+}
diff --git a/src/USER-OMP/thr_data.h b/src/USER-OMP/thr_data.h
new file mode 100644
index 0000000000..6345a203c7
--- /dev/null
+++ b/src/USER-OMP/thr_data.h
@@ -0,0 +1,132 @@
+/* -*- c++ -*- -------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifndef LMP_THR_DATA_H
+#define LMP_THR_DATA_H
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+namespace LAMMPS_NS {
+
+// per thread data accumulators
+// there should be one instance
+// of this class for each thread.
+class ThrData {
+  friend class FixOMP;
+  friend class ThrOMP;
+
+ public:
+  ThrData(int tid);
+  ~ThrData() {};
+
+  void check_tid(int);    // thread id consistency check
+  int get_tid() const { return _tid; }; // our thread id.
+
+  // erase accumulator contents and hook up force arrays
+  void init_force(int, double **, double **, double *, double *, double *);
+
+  // give access to per-thread offset arrays
+  double **get_f() const { return _f; };
+  double **get_torque() const { return _torque; };
+  double *get_de() const { return _de; };
+  double *get_drho() const { return _drho; };
+
+  // setup and erase per atom arrays
+  void init_adp(int, double *, double **, double **); // ADP (+ EAM)
+  void init_cdeam(int, double *, double *, double *); // CDEAM (+ EAM)
+  void init_eam(int, double *);                       // EAM
+  void init_eim(int, double *, double *);             // EIM (+ EAM)
+
+  void init_pppm(void *r1d) { _rho1d = r1d; };
+
+  // access methods for arrays that we handle in this class
+  double **get_lambda() const { return _lambda; };
+  double **get_mu() const { return _mu; };
+  double *get_D_values() const { return _D_values; };
+  double *get_fp() const { return _fp; };
+  double *get_rho() const { return _rho; };
+  double *get_rhoB() const { return _rhoB; };
+  void *get_rho1d() const { return _rho1d; };
+
+ private:
+  double eng_vdwl;        // non-bonded non-coulomb energy
+  double eng_coul;        // non-bonded coulomb energy
+  double eng_bond;        // bond energy
+  double eng_angle;       // angle energy
+  double eng_dihed;       // dihedral energy
+  double eng_imprp;       // improper energy
+  double eng_kspce;       // kspace energy
+  double virial_pair[6];  // virial contribution from non-bonded
+  double virial_bond[6];  // virial contribution from bonds
+  double virial_angle[6]; // virial contribution from angles
+  double virial_dihed[6]; // virial contribution from dihedrals
+  double virial_imprp[6]; // virial contribution from impropers
+  double virial_kspce[6]; // virial contribution from kspace
+  double *eatom_pair;
+  double *eatom_bond;
+  double *eatom_angle;
+  double *eatom_dihed;
+  double *eatom_imprp;
+  double *eatom_kspce;
+  double **vatom_pair;
+  double **vatom_bond;
+  double **vatom_angle;
+  double **vatom_dihed;
+  double **vatom_imprp;
+  double **vatom_kspce;
+
+  // per thread segments of various force or similar arrays
+
+  // these are maintained by atom styles
+  double **_f;
+  double **_torque;
+  double *_erforce;
+  double *_de;
+  double *_drho;
+
+  // these are maintained by individual pair styles
+  double **_mu, **_lambda;   // ADP (+ EAM)
+  double *_rhoB, *_D_values; // CDEAM (+ EAM)
+  double *_rho;              // EAM
+  double *_fp;               // EIM (+ EAM)
+
+  // this is for pppm/omp
+  void *_rho1d;
+
+  // my thread id
+  const int _tid;
+
+ public:
+  // compute global per thread virial contribution from global forces and positions
+  void virial_fdotr_compute(double **, int, int, int);
+
+  double memory_usage();
+
+ // disabled default methods
+ private:
+  ThrData() : _tid(-1) {};
+};
+
+////////////////////////////////////////////////////////////////////////
+//  helper functions operating on data replicated for thread support  //
+////////////////////////////////////////////////////////////////////////
+// generic per thread data reduction for continous arrays of nthreads*nmax size
+void data_reduce_thr(double *, int, int, int, int);
+}
+#endif
diff --git a/src/USER-OMP/thr_omp.cpp b/src/USER-OMP/thr_omp.cpp
new file mode 100644
index 0000000000..f42b4d327f
--- /dev/null
+++ b/src/USER-OMP/thr_omp.cpp
@@ -0,0 +1,1195 @@
+/* -------------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   OpenMP based threading support for LAMMPS
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+
+#include "thr_omp.h"
+
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "dihedral.h"
+#include "improper.h"
+#include "kspace.h"
+
+#include "math_const.h"
+
+#include <string.h>
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+/* ---------------------------------------------------------------------- */
+
+ThrOMP::ThrOMP(LAMMPS *ptr, int style)
+  : lmp(ptr), fix(NULL), thr_style(style), thr_error(0)
+{
+  // register fix omp with this class
+  int ifix = lmp->modify->find_fix("package_omp");
+  if (ifix < 0)
+    lmp->error->all(FLERR,"The 'package omp' command is required for /omp styles");
+  fix = static_cast<FixOMP *>(lmp->modify->fix[ifix]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+ThrOMP::~ThrOMP()
+{
+  // nothing to do?
+}
+
+/* ----------------------------------------------------------------------
+   Hook up per thread per atom arrays into the tally infrastructure
+   ---------------------------------------------------------------------- */
+
+void ThrOMP::ev_setup_thr(int eflag, int vflag, int nall, double *eatom,
+                          double **vatom, ThrData *thr)
+{
+  const int tid = thr->get_tid();
+  if (tid == 0) thr_error = 0;
+
+  if (thr_style & THR_PAIR) {
+    if (eflag & 2) {
+      thr->eatom_pair = eatom + tid*nall;
+      if (nall > 0)
+        memset(&(thr->eatom_pair[0]),0,nall*sizeof(double));
+    }
+    if (vflag & 4) {
+      thr->vatom_pair = vatom + tid*nall;
+      if (nall > 0)
+        memset(&(thr->vatom_pair[0][0]),0,nall*6*sizeof(double));
+    }
+  }
+
+  if (thr_style & THR_BOND) {
+    if (eflag & 2) {
+      thr->eatom_bond = eatom + tid*nall;
+      if (nall > 0)
+        memset(&(thr->eatom_bond[0]),0,nall*sizeof(double));
+    }
+    if (vflag & 4) {
+      thr->vatom_bond = vatom + tid*nall;
+      if (nall > 0)
+        memset(&(thr->vatom_bond[0][0]),0,nall*6*sizeof(double));
+    }
+  }
+
+  if (thr_style & THR_ANGLE) {
+    if (eflag & 2) {
+      thr->eatom_angle = eatom + tid*nall;
+      if (nall > 0)
+        memset(&(thr->eatom_angle[0]),0,nall*sizeof(double));
+    }
+    if (vflag & 4) {
+      thr->vatom_angle = vatom + tid*nall;
+      if (nall > 0)
+        memset(&(thr->vatom_angle[0][0]),0,nall*6*sizeof(double));
+    }
+  }
+
+  if (thr_style & THR_DIHEDRAL) {
+    if (eflag & 2) {
+      thr->eatom_dihed = eatom + tid*nall;
+      if (nall > 0)
+        memset(&(thr->eatom_dihed[0]),0,nall*sizeof(double));
+    }
+    if (vflag & 4) {
+      thr->vatom_dihed = vatom + tid*nall;
+      if (nall > 0)
+        memset(&(thr->vatom_dihed[0][0]),0,nall*6*sizeof(double));
+    }
+  }
+
+  if (thr_style & THR_IMPROPER) {
+    if (eflag & 2) {
+      thr->eatom_imprp = eatom + tid*nall;
+      if (nall > 0)
+        memset(&(thr->eatom_imprp[0]),0,nall*sizeof(double));
+    }
+    if (vflag & 4) {
+      thr->vatom_imprp = vatom + tid*nall;
+      if (nall > 0)
+        memset(&(thr->vatom_imprp[0][0]),0,nall*6*sizeof(double));
+    }
+  }
+
+  // nothing to do for THR_KSPACE
+}
+
+/* ----------------------------------------------------------------------
+   Reduce per thread data into the regular structures
+   Reduction of global properties is serialized with a "critical"
+   directive, so that only one thread at a time will access the
+   global variables. Since we are not synchronized, this should
+   come with little overhead. The reduction of per-atom properties
+   in contrast is parallelized over threads in the same way as forces.
+   ---------------------------------------------------------------------- */
+
+void ThrOMP::reduce_thr(void *style, const int eflag, const int vflag,
+                        ThrData *const thr, const int nproxy)
+{
+  const int nlocal = lmp->atom->nlocal;
+  const int nghost = lmp->atom->nghost;
+  const int nall = nlocal + nghost;
+  const int nfirst = lmp->atom->nfirst;
+  const int nthreads = lmp->comm->nthreads;
+  const int evflag = eflag | vflag;
+
+  const int tid = thr->get_tid();
+  double **f = lmp->atom->f;
+  double **x = lmp->atom->x;
+
+  int need_force_reduce = 1;
+
+  if (evflag)
+    sync_threads();
+
+  switch (thr_style) {
+
+  case THR_PAIR: {
+    Pair * const pair = lmp->force->pair;
+
+    if (pair->vflag_fdotr) {
+
+      if (style == fix->last_pair_hybrid) {
+        // pair_style hybrid will compute fdotr for us
+        // but we first need to reduce the forces
+        data_reduce_thr(&(f[0][0]), nall, nthreads, 3, tid);
+        need_force_reduce = 0;
+      }
+
+      // this is a non-hybrid pair style. compute per thread fdotr
+      if (fix->last_pair_hybrid == NULL) {
+        if (lmp->neighbor->includegroup == 0)
+          thr->virial_fdotr_compute(x, nlocal, nghost, -1);
+        else
+          thr->virial_fdotr_compute(x, nlocal, nghost, nfirst);
+      }
+    }
+
+    if (evflag) {
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+      {
+        if (eflag & 1) {
+          pair->eng_vdwl += thr->eng_vdwl;
+          pair->eng_coul += thr->eng_coul;
+          thr->eng_vdwl = 0.0;
+          thr->eng_coul = 0.0;
+        }
+        if (vflag & 3)
+          for (int i=0; i < 6; ++i) {
+            pair->virial[i] += thr->virial_pair[i];
+            thr->virial_pair[i] = 0.0;
+          }
+      }
+      if (eflag & 2) {
+        data_reduce_thr(&(pair->eatom[0]), nall, nthreads, 1, tid);
+      }
+      if (vflag & 4) {
+        data_reduce_thr(&(pair->vatom[0][0]), nall, nthreads, 6, tid);
+      }
+    }
+  }
+    break;
+
+  case THR_PAIR|THR_PROXY: {
+    Pair * const pair = lmp->force->pair;
+
+    if (tid >= nproxy && pair->vflag_fdotr) {
+
+      if (fix->last_pair_hybrid) {
+        if (tid == nproxy)
+          lmp->error->all(FLERR,
+                          "Cannot use hybrid pair style with kspace proxy");
+        else return;
+      }
+
+      // this is a non-hybrid pair style. compute per thread fdotr
+      if (fix->last_pair_hybrid == NULL) {
+        if (lmp->neighbor->includegroup == 0)
+          thr->virial_fdotr_compute(x, nlocal, nghost, -1);
+        else
+          thr->virial_fdotr_compute(x, nlocal, nghost, nfirst);
+      }
+    }
+
+    if (evflag) {
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+      {
+        if (tid < nproxy) {
+          // nothing to collect for kspace proxy threads
+          // just reset pair accumulators to 0.0.
+          if (eflag & 1) {
+            thr->eng_vdwl = 0.0;
+            thr->eng_coul = 0.0;
+          }
+          if (vflag & 3)
+            for (int i=0; i < 6; ++i) {
+              thr->virial_pair[i] = 0.0;
+            }
+        } else {
+          if (eflag & 1) {
+            pair->eng_vdwl += thr->eng_vdwl;
+            pair->eng_coul += thr->eng_coul;
+            thr->eng_vdwl = 0.0;
+            thr->eng_coul = 0.0;
+          }
+          if (vflag & 3)
+            for (int i=0; i < 6; ++i) {
+              pair->virial[i] += thr->virial_pair[i];
+              thr->virial_pair[i] = 0.0;
+            }
+        }
+      }
+      if (eflag & 2) {
+        data_reduce_thr(&(pair->eatom[0]), nall, nthreads, 1, tid);
+      }
+      if (vflag & 4) {
+        data_reduce_thr(&(pair->vatom[0][0]), nall, nthreads, 6, tid);
+      }
+    }
+  }
+    break;
+
+  case THR_BOND:
+
+    if (evflag) {
+      Bond * const bond = lmp->force->bond;
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+      {
+        if (eflag & 1) {
+          bond->energy += thr->eng_bond;
+          thr->eng_bond = 0.0;
+        }
+
+        if (vflag & 3) {
+          for (int i=0; i < 6; ++i) {
+            bond->virial[i] += thr->virial_bond[i];
+            thr->virial_bond[i] = 0.0;
+          }
+        }
+      }
+
+      if (eflag & 2) {
+        data_reduce_thr(&(bond->eatom[0]), nall, nthreads, 1, tid);
+      }
+      if (vflag & 4) {
+        data_reduce_thr(&(bond->vatom[0][0]), nall, nthreads, 6, tid);
+      }
+
+    }
+    break;
+
+  case THR_ANGLE:
+
+    if (evflag) {
+      Angle * const angle = lmp->force->angle;
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+      {
+        if (eflag & 1) {
+          angle->energy += thr->eng_angle;
+          thr->eng_angle = 0.0;
+        }
+
+        if (vflag & 3) {
+          for (int i=0; i < 6; ++i) {
+            angle->virial[i] += thr->virial_angle[i];
+            thr->virial_angle[i] = 0.0;
+          }
+        }
+      }
+
+      if (eflag & 2) {
+        data_reduce_thr(&(angle->eatom[0]), nall, nthreads, 1, tid);
+      }
+      if (vflag & 4) {
+        data_reduce_thr(&(angle->vatom[0][0]), nall, nthreads, 6, tid);
+      }
+
+    }
+    break;
+
+  case THR_DIHEDRAL:
+
+    if (evflag) {
+      Dihedral * const dihedral = lmp->force->dihedral;
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+      {
+        if (eflag & 1) {
+          dihedral->energy += thr->eng_dihed;
+          thr->eng_dihed = 0.0;
+        }
+
+        if (vflag & 3) {
+          for (int i=0; i < 6; ++i) {
+            dihedral->virial[i] += thr->virial_dihed[i];
+            thr->virial_dihed[i] = 0.0;
+          }
+        }
+      }
+
+      if (eflag & 2) {
+        data_reduce_thr(&(dihedral->eatom[0]), nall, nthreads, 1, tid);
+      }
+      if (vflag & 4) {
+        data_reduce_thr(&(dihedral->vatom[0][0]), nall, nthreads, 6, tid);
+      }
+
+    }
+    break;
+
+  case THR_DIHEDRAL|THR_CHARMM: // special case for CHARMM dihedrals
+
+    if (evflag) {
+      Dihedral * const dihedral = lmp->force->dihedral;
+      Pair * const pair = lmp->force->pair;
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+      {
+        if (eflag & 1) {
+          dihedral->energy += thr->eng_dihed;
+          pair->eng_vdwl += thr->eng_vdwl;
+          pair->eng_coul += thr->eng_coul;
+          thr->eng_dihed = 0.0;
+          thr->eng_vdwl = 0.0;
+          thr->eng_coul = 0.0;
+        }
+
+        if (vflag & 3) {
+          for (int i=0; i < 6; ++i) {
+            dihedral->virial[i] += thr->virial_dihed[i];
+            pair->virial[i] += thr->virial_pair[i];
+            thr->virial_dihed[i] = 0.0;
+            thr->virial_pair[i] = 0.0;
+          }
+        }
+      }
+
+      if (eflag & 2) {
+        data_reduce_thr(&(dihedral->eatom[0]), nall, nthreads, 1, tid);
+        data_reduce_thr(&(pair->eatom[0]), nall, nthreads, 1, tid);
+      }
+      if (vflag & 4) {
+        data_reduce_thr(&(dihedral->vatom[0][0]), nall, nthreads, 6, tid);
+        data_reduce_thr(&(pair->vatom[0][0]), nall, nthreads, 6, tid);
+      }
+    }
+    break;
+
+  case THR_IMPROPER:
+
+    if (evflag) {
+      Improper *improper = lmp->force->improper;
+#if defined(_OPENMP)
+#pragma omp critical
+#endif
+      {
+        if (eflag & 1) {
+          improper->energy += thr->eng_imprp;
+          thr->eng_imprp = 0.0;
+        }
+
+        if (vflag & 3) {
+          for (int i=0; i < 6; ++i) {
+            improper->virial[i] += thr->virial_imprp[i];
+            thr->virial_imprp[i] = 0.0;
+          }
+        }
+      }
+
+      if (eflag & 2) {
+        data_reduce_thr(&(improper->eatom[0]), nall, nthreads, 1, tid);
+      }
+      if (vflag & 4) {
+        data_reduce_thr(&(improper->vatom[0][0]), nall, nthreads, 6, tid);
+      }
+
+    }
+    break;
+
+  case THR_KSPACE|THR_PROXY: // fallthrough
+  case THR_KSPACE:
+    // nothing to do
+    break;
+
+  default:
+    printf("tid:%d unhandled thr_style case %d\n", tid, thr_style);
+    break;
+  }
+
+  if (style == fix->last_omp_style) {
+    if (need_force_reduce)
+      data_reduce_thr(&(f[0][0]), nall, nthreads, 3, tid);
+
+    if (lmp->atom->torque)
+      data_reduce_thr(&(lmp->atom->torque[0][0]), nall, nthreads, 3, tid);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   tally eng_vdwl and eng_coul into per thread global and per-atom accumulators
+------------------------------------------------------------------------- */
+
+void ThrOMP::e_tally_thr(Pair * const pair, const int i, const int j,
+                         const int nlocal, const int newton_pair,
+                         const double evdwl, const double ecoul, ThrData * const thr)
+{
+  if (pair->eflag_global) {
+    if (newton_pair) {
+      thr->eng_vdwl += evdwl;
+      thr->eng_coul += ecoul;
+    } else {
+      const double evdwlhalf = 0.5*evdwl;
+      const double ecoulhalf = 0.5*ecoul;
+      if (i < nlocal) {
+        thr->eng_vdwl += evdwlhalf;
+        thr->eng_coul += ecoulhalf;
+      }
+      if (j < nlocal) {
+        thr->eng_vdwl += evdwlhalf;
+        thr->eng_coul += ecoulhalf;
+      }
+    }
+  }
+  if (pair->eflag_atom) {
+    const double epairhalf = 0.5 * (evdwl + ecoul);
+    if (newton_pair || i < nlocal) thr->eatom_pair[i] += epairhalf;
+    if (newton_pair || j < nlocal) thr->eatom_pair[j] += epairhalf;
+  }
+}
+
+/* helper functions */
+static void v_tally(double * const vout, const double * const vin)
+{
+  vout[0] += vin[0];
+  vout[1] += vin[1];
+  vout[2] += vin[2];
+  vout[3] += vin[3];
+  vout[4] += vin[4];
+  vout[5] += vin[5];
+}
+
+static void v_tally(double * const vout, const double scale, const double * const vin)
+{
+  vout[0] += scale*vin[0];
+  vout[1] += scale*vin[1];
+  vout[2] += scale*vin[2];
+  vout[3] += scale*vin[3];
+  vout[4] += scale*vin[4];
+  vout[5] += scale*vin[5];
+}
+
+/* ----------------------------------------------------------------------
+   tally virial into per thread global and per-atom accumulators
+------------------------------------------------------------------------- */
+void ThrOMP::v_tally_thr(Pair * const pair, const int i, const int j,
+                         const int nlocal, const int newton_pair,
+                         const double * const v, ThrData * const thr)
+{
+  if (pair->vflag_global) {
+    double * const va = thr->virial_pair;
+    if (newton_pair) {
+      v_tally(va,v);
+    } else {
+      if (i < nlocal) v_tally(va,0.5,v);
+      if (j < nlocal) v_tally(va,0.5,v);
+    }
+  }
+
+  if (pair->vflag_atom) {
+    if (newton_pair || i < nlocal) {
+      double * const va = thr->vatom_pair[i];
+      v_tally(va,0.5,v);
+    }
+    if (newton_pair || j < nlocal) {
+      double * const va = thr->vatom_pair[j];
+      v_tally(va,0.5,v);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   tally eng_vdwl and virial into per thread global and per-atom accumulators
+   need i < nlocal test since called by bond_quartic and dihedral_charmm
+------------------------------------------------------------------------- */
+
+void ThrOMP::ev_tally_thr(Pair * const pair, const int i, const int j, const int nlocal,
+                          const int newton_pair, const double evdwl, const double ecoul,
+                          const double fpair, const double delx, const double dely,
+                          const double delz, ThrData * const thr)
+{
+
+  if (pair->eflag_either)
+    e_tally_thr(pair, i, j, nlocal, newton_pair, evdwl, ecoul, thr);
+
+  if (pair->vflag_either) {
+    double v[6];
+    v[0] = delx*delx*fpair;
+    v[1] = dely*dely*fpair;
+    v[2] = delz*delz*fpair;
+    v[3] = delx*dely*fpair;
+    v[4] = delx*delz*fpair;
+    v[5] = dely*delz*fpair;
+
+    v_tally_thr(pair, i, j, nlocal, newton_pair, v, thr);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   tally eng_vdwl and virial into global and per-atom accumulators
+   for virial, have delx,dely,delz and fx,fy,fz
+------------------------------------------------------------------------- */
+
+void ThrOMP::ev_tally_xyz_thr(Pair * const pair, const int i, const int j,
+                              const int nlocal, const int newton_pair,
+                              const double evdwl, const double ecoul,
+                              const double fx, const double fy, const double fz,
+                              const double delx, const double dely, const double delz,
+                              ThrData * const thr)
+{
+
+  if (pair->eflag_either)
+    e_tally_thr(pair, i, j, nlocal, newton_pair, evdwl, ecoul, thr);
+
+  if (pair->vflag_either) {
+    double v[6];
+    v[0] = delx*fx;
+    v[1] = dely*fy;
+    v[2] = delz*fz;
+    v[3] = delx*fy;
+    v[4] = delx*fz;
+    v[5] = dely*fz;
+
+    v_tally_thr(pair, i, j, nlocal, newton_pair, v, thr);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   tally eng_vdwl and virial into global and per-atom accumulators
+   called by SW and hbond potentials, newton_pair is always on
+   virial = riFi + rjFj + rkFk = (rj-ri) Fj + (rk-ri) Fk = drji*fj + drki*fk
+ ------------------------------------------------------------------------- */
+
+void ThrOMP::ev_tally3_thr(Pair * const pair, const int i, const int j, const int k,
+                           const double evdwl, const double ecoul,
+                           const double * const fj, const double * const fk,
+                           const double * const drji, const double * const drki,
+                           ThrData * const thr)
+{
+  if (pair->eflag_either) {
+    if (pair->eflag_global) {
+      thr->eng_vdwl += evdwl;
+      thr->eng_coul += ecoul;
+    }
+    if (pair->eflag_atom) {
+      const double epairthird = THIRD * (evdwl + ecoul);
+      thr->eatom_pair[i] += epairthird;
+      thr->eatom_pair[j] += epairthird;
+      thr->eatom_pair[k] += epairthird;
+    }
+  }
+
+  if (pair->vflag_either) {
+    double v[6];
+
+    v[0] = drji[0]*fj[0] + drki[0]*fk[0];
+    v[1] = drji[1]*fj[1] + drki[1]*fk[1];
+    v[2] = drji[2]*fj[2] + drki[2]*fk[2];
+    v[3] = drji[0]*fj[1] + drki[0]*fk[1];
+    v[4] = drji[0]*fj[2] + drki[0]*fk[2];
+    v[5] = drji[1]*fj[2] + drki[1]*fk[2];
+
+    if (pair->vflag_global) v_tally(thr->virial_pair,v);
+
+    if (pair->vflag_atom) {
+      v_tally(thr->vatom_pair[i],THIRD,v);
+      v_tally(thr->vatom_pair[j],THIRD,v);
+      v_tally(thr->vatom_pair[k],THIRD,v);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   tally eng_vdwl and virial into global and per-atom accumulators
+   called by AIREBO potential, newton_pair is always on
+ ------------------------------------------------------------------------- */
+
+void ThrOMP::ev_tally4_thr(Pair * const pair, const int i, const int j,
+                           const int k, const int m, const double evdwl,
+                           const double * const fi, const double * const fj,
+                           const double * const fk, const double * const drim,
+                           const double * const drjm, const double * const drkm,
+                           ThrData * const thr)
+{
+  double v[6];
+
+  if (pair->eflag_either) {
+    if (pair->eflag_global) thr->eng_vdwl += evdwl;
+    if (pair->eflag_atom) {
+      const double epairfourth = 0.25 * evdwl;
+      thr->eatom_pair[i] += epairfourth;
+      thr->eatom_pair[j] += epairfourth;
+      thr->eatom_pair[k] += epairfourth;
+      thr->eatom_pair[m] += epairfourth;
+    }
+  }
+
+  if (pair->vflag_atom) {
+    v[0] = 0.25 * (drim[0]*fi[0] + drjm[0]*fj[0] + drkm[0]*fk[0]);
+    v[1] = 0.25 * (drim[1]*fi[1] + drjm[1]*fj[1] + drkm[1]*fk[1]);
+    v[2] = 0.25 * (drim[2]*fi[2] + drjm[2]*fj[2] + drkm[2]*fk[2]);
+    v[3] = 0.25 * (drim[0]*fi[1] + drjm[0]*fj[1] + drkm[0]*fk[1]);
+    v[4] = 0.25 * (drim[0]*fi[2] + drjm[0]*fj[2] + drkm[0]*fk[2]);
+    v[5] = 0.25 * (drim[1]*fi[2] + drjm[1]*fj[2] + drkm[1]*fk[2]);
+
+    v_tally(thr->vatom_pair[i],v);
+    v_tally(thr->vatom_pair[j],v);
+    v_tally(thr->vatom_pair[k],v);
+    v_tally(thr->vatom_pair[m],v);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   tally ecoul and virial into each of n atoms in list
+   called by TIP4P potential, newton_pair is always on
+   changes v values by dividing by n
+ ------------------------------------------------------------------------- */
+
+void ThrOMP::ev_tally_list_thr(Pair * const pair, const int n,
+                               const int * const list, const double ecoul,
+                               const double * const v, ThrData * const thr)
+{
+  if (pair->eflag_either) {
+    if (pair->eflag_global) thr->eng_coul += ecoul;
+    if (pair->eflag_atom) {
+      double epairatom = ecoul/static_cast<double>(n);
+      for (int i = 0; i < n; i++) thr->eatom_pair[list[i]] += epairatom;
+    }
+  }
+
+  if (pair->vflag_either) {
+    if (pair->vflag_global)
+      v_tally(thr->virial_pair,v);
+
+    if (pair->vflag_atom) {
+      const double s = 1.0/static_cast<double>(n);
+      double vtmp[6];
+
+      vtmp[0] = s * v[0];
+      vtmp[1] = s * v[1];
+      vtmp[2] = s * v[2];
+      vtmp[3] = s * v[3];
+      vtmp[4] = s * v[4];
+      vtmp[5] = s * v[5];
+
+      for (int i = 0; i < n; i++) {
+        const int j = list[i];
+        v_tally(thr->vatom_pair[j],vtmp);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   tally energy and virial into global and per-atom accumulators
+------------------------------------------------------------------------- */
+
+void ThrOMP::ev_tally_thr(Bond * const bond, const int i, const int j, const int nlocal,
+                          const int newton_bond, const double ebond, const double fbond,
+                          const double delx, const double dely, const double delz,
+                          ThrData * const thr)
+{
+  if (bond->eflag_either) {
+    const double ebondhalf = 0.5*ebond;
+    if (newton_bond) {
+      if (bond->eflag_global)
+        thr->eng_bond += ebond;
+      if (bond->eflag_atom) {
+        thr->eatom_bond[i] += ebondhalf;
+        thr->eatom_bond[j] += ebondhalf;
+      }
+    } else {
+      if (bond->eflag_global) {
+        if (i < nlocal) thr->eng_bond += ebondhalf;
+        if (j < nlocal) thr->eng_bond += ebondhalf;
+      }
+      if (bond->eflag_atom) {
+        if (i < nlocal) thr->eatom_bond[i] += ebondhalf;
+        if (j < nlocal) thr->eatom_bond[j] += ebondhalf;
+      }
+    }
+  }
+
+  if (bond->vflag_either) {
+    double v[6];
+
+    v[0] = delx*delx*fbond;
+    v[1] = dely*dely*fbond;
+    v[2] = delz*delz*fbond;
+    v[3] = delx*dely*fbond;
+    v[4] = delx*delz*fbond;
+    v[5] = dely*delz*fbond;
+
+    if (bond->vflag_global) {
+      if (newton_bond)
+        v_tally(thr->virial_bond,v);
+      else {
+        if (i < nlocal)
+          v_tally(thr->virial_bond,0.5,v);
+        if (j < nlocal)
+          v_tally(thr->virial_bond,0.5,v);
+      }
+    }
+
+    if (bond->vflag_atom) {
+      v[0] *= 0.5;
+      v[1] *= 0.5;
+      v[2] *= 0.5;
+      v[3] *= 0.5;
+      v[4] *= 0.5;
+      v[5] *= 0.5;
+
+      if (newton_bond) {
+        v_tally(thr->vatom_bond[i],v);
+        v_tally(thr->vatom_bond[j],v);
+      } else {
+        if (j < nlocal)
+          v_tally(thr->vatom_bond[i],v);
+        if (j < nlocal)
+          v_tally(thr->vatom_bond[j],v);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   tally energy and virial into global and per-atom accumulators
+   virial = r1F1 + r2F2 + r3F3 = (r1-r2) F1 + (r3-r2) F3 = del1*f1 + del2*f3
+------------------------------------------------------------------------- */
+
+void ThrOMP::ev_tally_thr(Angle * const angle, const int i, const int j, const int k,
+                          const int nlocal, const int newton_bond, const double eangle,
+                          const double * const f1, const double * const f3,
+                          const double delx1, const double dely1, const double delz1,
+                          const double delx2, const double dely2, const double delz2,
+                          ThrData * const thr)
+{
+  if (angle->eflag_either) {
+    const double eanglethird = THIRD*eangle;
+    if (newton_bond) {
+      if (angle->eflag_global)
+        thr->eng_angle += eangle;
+      if (angle->eflag_atom) {
+        thr->eatom_angle[i] += eanglethird;
+        thr->eatom_angle[j] += eanglethird;
+        thr->eatom_angle[k] += eanglethird;
+      }
+    } else {
+      if (angle->eflag_global) {
+        if (i < nlocal) thr->eng_angle += eanglethird;
+        if (j < nlocal) thr->eng_angle += eanglethird;
+        if (k < nlocal) thr->eng_angle += eanglethird;
+      }
+      if (angle->eflag_atom) {
+        if (i < nlocal) thr->eatom_angle[i] += eanglethird;
+        if (j < nlocal) thr->eatom_angle[j] += eanglethird;
+        if (k < nlocal) thr->eatom_angle[k] += eanglethird;
+      }
+    }
+  }
+
+  if (angle->vflag_either) {
+    double v[6];
+
+    v[0] = delx1*f1[0] + delx2*f3[0];
+    v[1] = dely1*f1[1] + dely2*f3[1];
+    v[2] = delz1*f1[2] + delz2*f3[2];
+    v[3] = delx1*f1[1] + delx2*f3[1];
+    v[4] = delx1*f1[2] + delx2*f3[2];
+    v[5] = dely1*f1[2] + dely2*f3[2];
+
+    if (angle->vflag_global) {
+      if (newton_bond) {
+        v_tally(thr->virial_angle,v);
+      } else {
+        int cnt = 0;
+        if (i < nlocal) ++cnt;
+        if (j < nlocal) ++cnt;
+        if (k < nlocal) ++cnt;
+        v_tally(thr->virial_angle,cnt*THIRD,v);
+      }
+    }
+
+    if (angle->vflag_atom) {
+      v[0] *= THIRD;
+      v[1] *= THIRD;
+      v[2] *= THIRD;
+      v[3] *= THIRD;
+      v[4] *= THIRD;
+      v[5] *= THIRD;
+
+      if (newton_bond) {
+        v_tally(thr->vatom_angle[i],v);
+        v_tally(thr->vatom_angle[j],v);
+        v_tally(thr->vatom_angle[k],v);
+      } else {
+        if (j < nlocal) v_tally(thr->vatom_angle[i],v);
+        if (j < nlocal) v_tally(thr->vatom_angle[j],v);
+        if (k < nlocal) v_tally(thr->vatom_angle[k],v);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   tally energy and virial from 1-3 repulsion of SDK angle into accumulators
+------------------------------------------------------------------------- */
+
+void ThrOMP::ev_tally13_thr(Angle * const angle, const int i1, const int i3,
+                            const int nlocal, const int newton_bond,
+                            const double epair, const double fpair,
+                            const double delx, const double dely,
+                            const double delz, ThrData * const thr)
+{
+
+  if (angle->eflag_either) {
+    const double epairhalf = 0.5 * epair;
+
+    if (angle->eflag_global) {
+      if (newton_bond || i1 < nlocal)
+        thr->eng_angle += epairhalf;
+      if (newton_bond || i3 < nlocal)
+        thr->eng_angle += epairhalf;
+    }
+
+    if (angle->eflag_atom) {
+      if (newton_bond || i1 < nlocal) thr->eatom_angle[i1] += epairhalf;
+      if (newton_bond || i3 < nlocal) thr->eatom_angle[i3] += epairhalf;
+    }
+  }
+
+  if (angle->vflag_either) {
+    double v[6];
+    v[0] = delx*delx*fpair;
+    v[1] = dely*dely*fpair;
+    v[2] = delz*delz*fpair;
+    v[3] = delx*dely*fpair;
+    v[4] = delx*delz*fpair;
+    v[5] = dely*delz*fpair;
+
+    if (angle->vflag_global) {
+      double * const va = thr->virial_angle;
+      if (newton_bond || i1 < nlocal) v_tally(va,0.5,v);
+      if (newton_bond || i3 < nlocal) v_tally(va,0.5,v);
+    }
+
+    if (angle->vflag_atom) {
+      if (newton_bond || i1 < nlocal) {
+        double * const va = thr->vatom_angle[i1];
+        v_tally(va,0.5,v);
+      }
+      if (newton_bond || i3 < nlocal) {
+        double * const va = thr->vatom_angle[i3];
+        v_tally(va,0.5,v);
+      }
+    }
+  }
+}
+
+
+/* ----------------------------------------------------------------------
+   tally energy and virial into global and per-atom accumulators
+   virial = r1F1 + r2F2 + r3F3 + r4F4 = (r1-r2) F1 + (r3-r2) F3 + (r4-r2) F4
+          = (r1-r2) F1 + (r3-r2) F3 + (r4-r3 + r3-r2) F4
+          = vb1*f1 + vb2*f3 + (vb3+vb2)*f4
+------------------------------------------------------------------------- */
+
+void ThrOMP::ev_tally_thr(Dihedral * const dihed, const int i1, const int i2,
+                          const int i3, const int i4, const int nlocal,
+                          const int newton_bond, const double edihedral,
+                          const double * const f1, const double * const f3,
+                          const double * const f4, const double vb1x,
+                          const double vb1y, const double vb1z, const double vb2x,
+                          const double vb2y, const double vb2z, const double vb3x,
+                          const double vb3y, const double vb3z, ThrData * const thr)
+{
+
+  if (dihed->eflag_either) {
+    if (dihed->eflag_global) {
+      if (newton_bond) {
+        thr->eng_dihed += edihedral;
+      } else {
+        const double edihedralquarter = 0.25*edihedral;
+        int cnt = 0;
+        if (i1 < nlocal) ++cnt;
+        if (i2 < nlocal) ++cnt;
+        if (i3 < nlocal) ++cnt;
+        if (i4 < nlocal) ++cnt;
+        thr->eng_dihed += static_cast<double>(cnt)*edihedralquarter;
+      }
+    }
+    if (dihed->eflag_atom) {
+      const double edihedralquarter = 0.25*edihedral;
+      if (newton_bond) {
+        thr->eatom_dihed[i1] += edihedralquarter;
+        thr->eatom_dihed[i2] += edihedralquarter;
+        thr->eatom_dihed[i3] += edihedralquarter;
+        thr->eatom_dihed[i4] += edihedralquarter;
+      } else {
+        if (i1 < nlocal) thr->eatom_dihed[i1] +=  edihedralquarter;
+        if (i2 < nlocal) thr->eatom_dihed[i2] +=  edihedralquarter;
+        if (i3 < nlocal) thr->eatom_dihed[i3] +=  edihedralquarter;
+        if (i4 < nlocal) thr->eatom_dihed[i4] +=  edihedralquarter;
+      }
+    }
+  }
+
+  if (dihed->vflag_either) {
+    double v[6];
+    v[0] = vb1x*f1[0] + vb2x*f3[0] + (vb3x+vb2x)*f4[0];
+    v[1] = vb1y*f1[1] + vb2y*f3[1] + (vb3y+vb2y)*f4[1];
+    v[2] = vb1z*f1[2] + vb2z*f3[2] + (vb3z+vb2z)*f4[2];
+    v[3] = vb1x*f1[1] + vb2x*f3[1] + (vb3x+vb2x)*f4[1];
+    v[4] = vb1x*f1[2] + vb2x*f3[2] + (vb3x+vb2x)*f4[2];
+    v[5] = vb1y*f1[2] + vb2y*f3[2] + (vb3y+vb2y)*f4[2];
+
+    if (dihed->vflag_global) {
+      if (newton_bond) {
+        v_tally(thr->virial_dihed,v);
+      } else {
+        int cnt = 0;
+        if (i1 < nlocal) ++cnt;
+        if (i2 < nlocal) ++cnt;
+        if (i3 < nlocal) ++cnt;
+        if (i4 < nlocal) ++cnt;
+        v_tally(thr->virial_dihed,0.25*static_cast<double>(cnt),v);
+      }
+    }
+
+    v[0] *= 0.25;
+    v[1] *= 0.25;
+    v[2] *= 0.25;
+    v[3] *= 0.25;
+    v[4] *= 0.25;
+    v[5] *= 0.25;
+
+    if (dihed->vflag_atom) {
+      if (newton_bond) {
+        v_tally(thr->vatom_dihed[i1],v);
+        v_tally(thr->vatom_dihed[i2],v);
+        v_tally(thr->vatom_dihed[i3],v);
+        v_tally(thr->vatom_dihed[i4],v);
+      } else {
+        if (i1 < nlocal) v_tally(thr->vatom_dihed[i1],v);
+        if (i2 < nlocal) v_tally(thr->vatom_dihed[i2],v);
+        if (i3 < nlocal) v_tally(thr->vatom_dihed[i3],v);
+        if (i4 < nlocal) v_tally(thr->vatom_dihed[i4],v);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   tally energy and virial into global and per-atom accumulators
+   virial = r1F1 + r2F2 + r3F3 + r4F4 = (r1-r2) F1 + (r3-r2) F3 + (r4-r2) F4
+          = (r1-r2) F1 + (r3-r2) F3 + (r4-r3 + r3-r2) F4
+          = vb1*f1 + vb2*f3 + (vb3+vb2)*f4
+------------------------------------------------------------------------- */
+
+void ThrOMP::ev_tally_thr(Improper * const imprp, const int i1, const int i2,
+                          const int i3, const int i4, const int nlocal,
+                          const int newton_bond, const double eimproper,
+                          const double * const f1, const double * const f3,
+                          const double * const f4, const double vb1x,
+                          const double vb1y, const double vb1z, const double vb2x,
+                          const double vb2y, const double vb2z, const double vb3x,
+                          const double vb3y, const double vb3z, ThrData * const thr)
+{
+
+  if (imprp->eflag_either) {
+    if (imprp->eflag_global) {
+      if (newton_bond) {
+        thr->eng_imprp += eimproper;
+      } else {
+        const double eimproperquarter = 0.25*eimproper;
+        int cnt = 0;
+        if (i1 < nlocal) ++cnt;
+        if (i2 < nlocal) ++cnt;
+        if (i3 < nlocal) ++cnt;
+        if (i4 < nlocal) ++cnt;
+        thr->eng_imprp += static_cast<double>(cnt)*eimproperquarter;
+      }
+    }
+    if (imprp->eflag_atom) {
+      const double eimproperquarter = 0.25*eimproper;
+      if (newton_bond) {
+        thr->eatom_imprp[i1] += eimproperquarter;
+        thr->eatom_imprp[i2] += eimproperquarter;
+        thr->eatom_imprp[i3] += eimproperquarter;
+        thr->eatom_imprp[i4] += eimproperquarter;
+      } else {
+        if (i1 < nlocal) thr->eatom_imprp[i1] +=  eimproperquarter;
+        if (i2 < nlocal) thr->eatom_imprp[i2] +=  eimproperquarter;
+        if (i3 < nlocal) thr->eatom_imprp[i3] +=  eimproperquarter;
+        if (i4 < nlocal) thr->eatom_imprp[i4] +=  eimproperquarter;
+      }
+    }
+  }
+
+  if (imprp->vflag_either) {
+    double v[6];
+    v[0] = vb1x*f1[0] + vb2x*f3[0] + (vb3x+vb2x)*f4[0];
+    v[1] = vb1y*f1[1] + vb2y*f3[1] + (vb3y+vb2y)*f4[1];
+    v[2] = vb1z*f1[2] + vb2z*f3[2] + (vb3z+vb2z)*f4[2];
+    v[3] = vb1x*f1[1] + vb2x*f3[1] + (vb3x+vb2x)*f4[1];
+    v[4] = vb1x*f1[2] + vb2x*f3[2] + (vb3x+vb2x)*f4[2];
+    v[5] = vb1y*f1[2] + vb2y*f3[2] + (vb3y+vb2y)*f4[2];
+
+    if (imprp->vflag_global) {
+      if (newton_bond) {
+        v_tally(thr->virial_imprp,v);
+      } else {
+        int cnt = 0;
+        if (i1 < nlocal) ++cnt;
+        if (i2 < nlocal) ++cnt;
+        if (i3 < nlocal) ++cnt;
+        if (i4 < nlocal) ++cnt;
+        v_tally(thr->virial_imprp,0.25*static_cast<double>(cnt),v);
+      }
+    }
+
+    v[0] *= 0.25;
+    v[1] *= 0.25;
+    v[2] *= 0.25;
+    v[3] *= 0.25;
+    v[4] *= 0.25;
+    v[5] *= 0.25;
+
+    if (imprp->vflag_atom) {
+      if (newton_bond) {
+        v_tally(thr->vatom_imprp[i1],v);
+        v_tally(thr->vatom_imprp[i2],v);
+        v_tally(thr->vatom_imprp[i3],v);
+        v_tally(thr->vatom_imprp[i4],v);
+      } else {
+        if (i1 < nlocal) v_tally(thr->vatom_imprp[i1],v);
+        if (i2 < nlocal) v_tally(thr->vatom_imprp[i2],v);
+        if (i3 < nlocal) v_tally(thr->vatom_imprp[i3],v);
+        if (i4 < nlocal) v_tally(thr->vatom_imprp[i4],v);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   tally virial into per-atom accumulators
+   called by AIREBO potential, newton_pair is always on
+   fpair is magnitude of force on atom I
+------------------------------------------------------------------------- */
+
+void ThrOMP::v_tally2_thr(const int i, const int j, const double fpair,
+                          const double * const drij, ThrData * const thr)
+{
+  double v[6];
+
+  v[0] = 0.5 * drij[0]*drij[0]*fpair;
+  v[1] = 0.5 * drij[1]*drij[1]*fpair;
+  v[2] = 0.5 * drij[2]*drij[2]*fpair;
+  v[3] = 0.5 * drij[0]*drij[1]*fpair;
+  v[4] = 0.5 * drij[0]*drij[2]*fpair;
+  v[5] = 0.5 * drij[1]*drij[2]*fpair;
+
+  v_tally(thr->vatom_pair[i],v);
+  v_tally(thr->vatom_pair[j],v);
+}
+
+/* ----------------------------------------------------------------------
+   tally virial into per-atom accumulators
+   called by AIREBO and Tersoff potential, newton_pair is always on
+------------------------------------------------------------------------- */
+
+void ThrOMP::v_tally3_thr(const int i, const int j, const int k,
+                          const double * const fi, const double * const fj,
+                          const double * const drik, const double * const drjk,
+                          ThrData * const thr)
+{
+  double v[6];
+
+  v[0] = THIRD * (drik[0]*fi[0] + drjk[0]*fj[0]);
+  v[1] = THIRD * (drik[1]*fi[1] + drjk[1]*fj[1]);
+  v[2] = THIRD * (drik[2]*fi[2] + drjk[2]*fj[2]);
+  v[3] = THIRD * (drik[0]*fi[1] + drjk[0]*fj[1]);
+  v[4] = THIRD * (drik[0]*fi[2] + drjk[0]*fj[2]);
+  v[5] = THIRD * (drik[1]*fi[2] + drjk[1]*fj[2]);
+
+  v_tally(thr->vatom_pair[i],v);
+  v_tally(thr->vatom_pair[j],v);
+  v_tally(thr->vatom_pair[k],v);
+}
+
+/* ----------------------------------------------------------------------
+   tally virial into per-atom accumulators
+   called by AIREBO potential, newton_pair is always on
+------------------------------------------------------------------------- */
+
+void ThrOMP::v_tally4_thr(const int i, const int j, const int k, const int m,
+                          const double * const fi, const double * const fj,
+                          const double * const fk, const double * const drim,
+                          const double * const drjm, const double * const drkm,
+                          ThrData * const thr)
+{
+  double v[6];
+
+  v[0] = 0.25 * (drim[0]*fi[0] + drjm[0]*fj[0] + drkm[0]*fk[0]);
+  v[1] = 0.25 * (drim[1]*fi[1] + drjm[1]*fj[1] + drkm[1]*fk[1]);
+  v[2] = 0.25 * (drim[2]*fi[2] + drjm[2]*fj[2] + drkm[2]*fk[2]);
+  v[3] = 0.25 * (drim[0]*fi[1] + drjm[0]*fj[1] + drkm[0]*fk[1]);
+  v[4] = 0.25 * (drim[0]*fi[2] + drjm[0]*fj[2] + drkm[0]*fk[2]);
+  v[5] = 0.25 * (drim[1]*fi[2] + drjm[1]*fj[2] + drkm[1]*fk[2]);
+
+  v_tally(thr->vatom_pair[i],v);
+  v_tally(thr->vatom_pair[j],v);
+  v_tally(thr->vatom_pair[k],v);
+  v_tally(thr->vatom_pair[m],v);
+}
+
+/* ---------------------------------------------------------------------- */
+
+double ThrOMP::memory_usage_thr()
+{
+  double bytes=0.0;
+
+  return bytes;
+}
diff --git a/src/USER-OMP/thr_omp.h b/src/USER-OMP/thr_omp.h
new file mode 100644
index 0000000000..19e50031f8
--- /dev/null
+++ b/src/USER-OMP/thr_omp.h
@@ -0,0 +1,187 @@
+/* -*- c++ -*- -------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifndef LMP_THR_OMP_H
+#define LMP_THR_OMP_H
+
+#include "pointers.h"
+#include "error.h"
+#include "fix_omp.h"
+#include "thr_data.h"
+
+namespace LAMMPS_NS {
+
+// forward declarations
+class Pair;
+class Bond;
+class Angle;
+class Dihedral;
+class Improper;
+class KSpace;
+class Fix;
+
+class ThrOMP {
+
+ protected:
+  LAMMPS *lmp; // reference to base lammps object.
+  FixOMP *fix; // pointer to fix_omp;
+
+  const int thr_style;
+  int thr_error;
+
+ public:
+  ThrOMP(LAMMPS *, int);
+  virtual ~ThrOMP();
+
+  double memory_usage_thr();
+
+  inline void sync_threads() {
+#if defined(_OPENMP)
+#pragma omp barrier
+#endif
+      { ; }
+    };
+
+  enum {THR_NONE=0,THR_PAIR=1,THR_BOND=1<<1,THR_ANGLE=1<<2,
+        THR_DIHEDRAL=1<<3,THR_IMPROPER=1<<4,THR_KSPACE=1<<5,
+        THR_CHARMM=1<<6,THR_PROXY=1<<7,THR_HYBRID=1<<8,THR_FIX=1<<9};
+
+ protected:
+  // extra ev_tally setup work for threaded styles
+  void ev_setup_thr(int, int, int, double *, double **, ThrData *);
+
+  // compute global per thread virial contribution from per-thread force
+  void virial_fdotr_compute_thr(double * const, const double * const * const,
+                                const double * const * const,
+                                const int, const int, const int);
+
+  // reduce per thread data as needed
+  void reduce_thr(void * const style, const int eflag, const int vflag,
+                  ThrData * const thr, const int nproxy=0);
+
+  // thread safe variant error abort support.
+  // signals an error condition in any thread by making
+  // thr_error > 0, if condition "cond" is true.
+  // will abort from thread 0 if thr_error is > 0
+  // otherwise return true.
+  // returns false if no error found on any thread.
+  // use return value to jump/return to end of threaded region.
+
+  bool check_error_thr(const bool cond, const int tid, const char *fname,
+                       const int line, const char *errmsg) {
+    if (cond) {
+#if defined(_OPENMP)
+#pragma omp atomic
+      ++thr_error;
+#endif
+      if (tid > 0) return true;
+      else lmp->error->one(fname,line,errmsg);
+    } else {
+      if (thr_error > 0) {
+        if (tid == 0) lmp->error->one(fname,line,errmsg);
+        else return true;
+      } else return false;
+    }
+    return false;
+  };
+
+ protected:
+
+  // threading adapted versions of the ev_tally infrastructure
+  // style specific versions (need access to style class flags)
+
+  // Pair
+  void e_tally_thr(Pair * const, const int, const int, const int,
+                   const int, const double, const double, ThrData * const);
+  void v_tally_thr(Pair * const, const int, const int, const int,
+                   const int, const double * const, ThrData * const);
+
+  void ev_tally_thr(Pair * const, const int, const int, const int, const int,
+                    const double, const double, const double, const double,
+                    const double, const double, ThrData * const);
+  void ev_tally_xyz_thr(Pair * const, const int, const int, const int,
+                        const int, const double, const double, const double,
+                        const double, const double, const double,
+                        const double, const double, ThrData * const);
+  void ev_tally3_thr(Pair * const, const int, const int, const int, const double,
+                     const double, const double * const, const double * const,
+                     const double * const, const double * const, ThrData * const);
+  void ev_tally4_thr(Pair * const, const int, const int, const int, const int,
+                     const double, const double * const, const double * const,
+                     const double * const, const double * const, const double * const,
+                     const double * const, ThrData * const);
+
+  // Bond
+  void ev_tally_thr(Bond * const, const int, const int, const int, const int,
+                    const double, const double, const double, const double,
+                    const double, ThrData * const);
+
+  // Angle
+  void ev_tally_thr(Angle * const, const int, const int, const int, const int, const int,
+                    const double, const double * const, const double * const,
+                    const double, const double, const double, const double, const double,
+                    const double, ThrData * const thr);
+  void ev_tally13_thr(Angle * const, const int, const int, const int, const int,
+                      const double, const double, const double, const double,
+                      const double, ThrData * const thr);
+
+  // Dihedral
+  void ev_tally_thr(Dihedral * const, const int, const int, const int, const int, const int,
+                    const int, const double, const double * const, const double * const,
+                    const double * const, const double, const double, const double,
+                    const double, const double, const double, const double, const double,
+                    const double, ThrData * const);
+
+  // Improper
+  void ev_tally_thr(Improper * const, const int, const int, const int, const int, const int,
+                    const int, const double, const double * const, const double * const,
+                    const double * const, const double, const double, const double,
+                    const double, const double, const double, const double, const double,
+                    const double, ThrData * const);
+
+  // style independent versions
+  void v_tally2_thr(const int, const int, const double, const double * const, ThrData * const);
+  void v_tally3_thr(const int, const int, const int, const double * const, const double * const,
+                    const double * const, const double * const, ThrData * const);
+  void v_tally4_thr(const int, const int, const int, const int, const double * const,
+                    const double * const, const double * const, const double * const,
+                    const double * const, const double * const, ThrData * const);
+  void ev_tally_list_thr(Pair * const, const int, const int * const,
+                         const double , const double * const , ThrData * const);
+
+};
+
+// set loop range thread id, and force array offset for threaded runs.
+static inline void loop_setup_thr(int &ifrom, int &ito, int &tid,
+                                  int inum, int nthreads, int nproxy=0)
+{
+#if defined(_OPENMP)
+  tid = omp_get_thread_num();
+
+  // each thread works on a fixed chunk of atoms.
+  const int idelta = 1 + inum/(nthreads-nproxy);
+  ifrom = (tid-nproxy)*idelta;
+  ito   = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
+#else
+  tid = 0;
+  ifrom = 0;
+  ito = inum;
+#endif
+}
+
+}
+#endif