Merge remote-tracking branch 'upstream/master'

2019-06-19 18:52:47 -04:00 · 2019-06-19 18:52:47 -04:00 · 87e6fda820
parent fe1afee84e bb11344424
commit 87e6fda820
123 changed files with 9368 additions and 3009 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -33,7 +33,6 @@ include(LAMMPSUtils)

 get_lammps_version(${LAMMPS_SOURCE_DIR}/version.h LAMMPS_VERSION)

-
 include(PreventInSourceBuilds)

 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
@ -51,6 +50,7 @@ check_for_autogen_files(${LAMMPS_SOURCE_DIR})
 # these need ot be done early (before further tests).
 #####################################################################
 include(CheckCCompilerFlag)
+include(CheckIncludeFileCXX)

 if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict")
@ -106,6 +106,8 @@ if(BUILD_LIB)
  endif()
 endif()

+option(BUILD_TOOLS "Build and install LAMMPS tools (msi2lmp, binary2txt, chain)" OFF)
+
 if(NOT BUILD_EXE AND NOT BUILD_LIB)
  message(FATAL_ERROR "You need to at least enable one of two following options: BUILD_LIB or BUILD_EXE")
 endif()
@ -186,10 +188,6 @@ if(LAMMPS_EXCEPTIONS)
  set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES} -DLAMMPS_EXCEPTIONS")
 endif()

-option(CMAKE_VERBOSE_MAKEFILE "Verbose makefile" OFF)
-
-
-
 # "hard" dependencies between packages resulting
 # in an error instead of skipping over files
 pkg_depends(MPIIO MPI)
@ -198,7 +196,6 @@ pkg_depends(USER-LB MPI)
 pkg_depends(USER-PHONON KSPACE)
 pkg_depends(USER-SCAFACOS MPI)

-include(CheckIncludeFileCXX)
 find_package(OpenMP QUIET)

 # TODO: this is a temporary workaround until a better solution is found. AK 2019-05-30
@ -227,6 +224,9 @@ if(PKG_MSCG OR PKG_USER-ATC OR PKG_USER-AWPMD OR PKG_USER-PLUMED OR PKG_USER-QUI
  find_package(LAPACK)
  find_package(BLAS)
  if(NOT LAPACK_FOUND OR NOT BLAS_FOUND)
+    if(CMAKE_GENERATOR STREQUAL "Ninja")
+      status(FATAL_ERROR "Cannot build internal linear algebra library with Ninja build tool due to lack for Fortran support")
+    endif()
    enable_language(Fortran)
    file(GLOB LAPACK_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/linalg/[^.]*.[fF])
    add_library(linalg STATIC ${LAPACK_SOURCES})
@ -522,6 +522,19 @@ if(BUILD_EXE)
  set_target_properties(lmp PROPERTIES OUTPUT_NAME ${LAMMPS_BINARY})
  install(TARGETS lmp DESTINATION ${CMAKE_INSTALL_BINDIR})
  install(FILES ${LAMMPS_DOC_DIR}/lammps.1 DESTINATION ${CMAKE_INSTALL_MANDIR}/man1 RENAME ${LAMMPS_BINARY}.1)
+endif()
+
+if(BUILD_TOOLS)
+  add_executable(binary2txt ${LAMMPS_TOOLS_DIR}/binary2txt.cpp)
+  install(TARGETS binary2txt DESTINATION ${CMAKE_INSTALL_BINDIR})
+
+  # ninja-build currently does not support fortran. thus we skip building this tool
+  if(NOT CMAKE_GENERATOR STREQUAL "Ninja")
+    message(STATUS "Skipping building 'chain.x' with Ninja build tool due to lack of Fortran support")
+    enable_language(Fortran)
+    add_executable(chain.x ${LAMMPS_TOOLS_DIR}/chain.f)
+    target_link_libraries(chain.x ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
+  endif()

  enable_language(C)
  get_filename_component(MSI2LMP_SOURCE_DIR ${LAMMPS_TOOLS_DIR}/msi2lmp/src ABSOLUTE)
@ -530,7 +543,6 @@ if(BUILD_EXE)
  target_link_libraries(msi2lmp m)
  install(TARGETS msi2lmp DESTINATION ${CMAKE_INSTALL_BINDIR})
  install(FILES ${LAMMPS_DOC_DIR}/msi2lmp.1 DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
-
 endif()

 include(Documentation)
--- a/cmake/Modules/Packages/KIM.cmake
+++ b/cmake/Modules/Packages/KIM.cmake
@ -13,6 +13,9 @@ if(PKG_KIM)
  endif()
  option(DOWNLOAD_KIM "Download KIM-API from OpenKIM instead of using an already installed one" ${DOWNLOAD_KIM_DEFAULT})
  if(DOWNLOAD_KIM)
+    if(CMAKE_GENERATOR STREQUAL "Ninja")
+      message(FATAL_ERROR "Cannot build downloaded KIM-API library with Ninja build tool")
+    endif()
    message(STATUS "KIM-API download requested - we will build our own")
    enable_language(C)
    enable_language(Fortran)
--- a/cmake/Modules/Packages/LATTE.cmake
+++ b/cmake/Modules/Packages/LATTE.cmake
@ -11,6 +11,9 @@ if(PKG_LATTE)
    if (CMAKE_VERSION VERSION_LESS "3.7") # due to SOURCE_SUBDIR
      message(FATAL_ERROR "For downlading LATTE you need at least cmake-3.7")
    endif()
+    if(CMAKE_GENERATOR STREQUAL "Ninja")
+      message(FATAL_ERROR "Cannot build downloaded LATTE library with Ninja build tool")
+    endif()
    message(STATUS "LATTE download requested - we will build our own")
    include(ExternalProject)
    ExternalProject_Add(latte_build
--- a/cmake/Modules/Packages/MSCG.cmake
+++ b/cmake/Modules/Packages/MSCG.cmake
@ -11,6 +11,9 @@ if(PKG_MSCG)
    if (CMAKE_VERSION VERSION_LESS "3.7") # due to SOURCE_SUBDIR
      message(FATAL_ERROR "For downlading MSCG you need at least cmake-3.7")
    endif()
+    if(CMAKE_GENERATOR STREQUAL "Ninja")
+      message(FATAL_ERROR "Cannot build downloaded MSCG library with Ninja build tool")
+    endif()
    include(ExternalProject)
    if(NOT LAPACK_FOUND)
      set(EXTRA_MSCG_OPTS "-DLAPACK_LIBRARIES=${CMAKE_CURRENT_BINARY_DIR}/liblinalg.a")
--- a/cmake/Modules/Packages/USER-INTEL.cmake
+++ b/cmake/Modules/Packages/USER-INTEL.cmake
@ -1,5 +1,4 @@
 if(PKG_USER-INTEL)
-  include(CheckIncludeFile)
  check_include_file_cxx(immintrin.h FOUND_IMMINTRIN)
  if(NOT FOUND_IMMINTRIN)
    message(FATAL_ERROR "immintrin.h header not found, Intel package won't work without it")
--- a/cmake/Modules/Packages/USER-PLUMED.cmake
+++ b/cmake/Modules/Packages/USER-PLUMED.cmake
@ -17,6 +17,9 @@ if(PKG_USER-PLUMED)

  option(DOWNLOAD_PLUMED "Download Plumed package instead of using an already installed one" ${DOWNLOAD_PLUMED_DEFAULT})
  if(DOWNLOAD_PLUMED)
+    if(CMAKE_GENERATOR STREQUAL "Ninja")
+      message(FATAL_ERROR "Cannot build downloaded Plumed library with Ninja build tool")
+    endif()
    if(BUILD_MPI)
      set(PLUMED_CONFIG_MPI "--enable-mpi")
      set(PLUMED_CONFIG_CC  ${CMAKE_MPI_C_COMPILER})
--- a/cmake/Modules/Packages/USER-SCAFACOS.cmake
+++ b/cmake/Modules/Packages/USER-SCAFACOS.cmake
@ -13,6 +13,9 @@ if(PKG_USER-SCAFACOS)
  endif()
  option(DOWNLOAD_SCAFACOS "Download ScaFaCoS library instead of using an already installed one" ${DOWNLOAD_SCAFACOS_DEFAULT})
  if(DOWNLOAD_SCAFACOS)
+    if(CMAKE_GENERATOR STREQUAL "Ninja")
+      message(FATAL_ERROR "Cannot build downloaded ScaFaCoS library with Ninja build tool")
+    endif()
    message(STATUS "ScaFaCoS download requested - we will build our own")
    include(ExternalProject)
    ExternalProject_Add(scafacos_build
--- a/cmake/Modules/Packages/VORONOI.cmake
+++ b/cmake/Modules/Packages/VORONOI.cmake
@ -7,6 +7,9 @@ if(PKG_VORONOI)
  endif()
  option(DOWNLOAD_VORO "Download and compile the Voro++ library instead of using an already installed one" ${DOWNLOAD_VORO_DEFAULT})
  if(DOWNLOAD_VORO)
+    if(CMAKE_GENERATOR STREQUAL "Ninja")
+      message(FATAL_ERROR "Cannot build downloaded Voro++ library with Ninja build tool")
+    endif()
    message(STATUS "Voro++ download requested - we will build our own")
    include(ExternalProject)

--- a/doc/lammps.1
+++ b/doc/lammps.1
@ -1,4 +1,4 @@
-.TH LAMMPS "5 June 2019" "2019-06-05"
+.TH LAMMPS "18 June 2019" "2019-06-18"
 .SH NAME
 .B LAMMPS
 \- Molecular Dynamics Simulator.
--- a/doc/src/Build_cmake.txt
+++ b/doc/src/Build_cmake.txt
@ -32,10 +32,18 @@ cmake \[options ...\] ../cmake     # configuration with (command-line) cmake
 make                             # compilation :pre

 The cmake command will detect available features, enable selected
-packages and options, and will generate the build environment.  The make
-command will then compile and link LAMMPS, producing (by default) an
-executable called "lmp" and a library called "liblammps.a" in the
-"build" folder.
+packages and options, and will generate the build environment.  By default
+this build environment will be created for "Unix Makefiles" on most
+platforms and particularly on Linux.  However, alternate build tools
+(e.g. Ninja) and support files for Integrated Development Environments
+(IDE) like Eclipse, CodeBlocks, or Kate can be generated, too. This is
+selected via the "-G" command line flag. For the rest of the documentation
+we will assume that the build environment is generated for makefiles
+and thus the make command will be used to compile and link LAMMPS as
+indicated above, producing (by default) an executable called "lmp" and
+a library called "liblammps.a" in the "build" folder.  When generating
+a build environment for the "Ninja" build tool, the build command would
+be "ninja" instead of "make".

 If your machine has multiple CPU cores (most do these days), using a
 command like "make -jN" (with N being the number of available local
--- a/doc/src/Eqs/pair_spin_dipole.jpg
+++ b/doc/src/Eqs/pair_spin_dipole.jpg
--- a/doc/src/Eqs/pair_spin_dipole.tex
+++ b/doc/src/Eqs/pair_spin_dipole.tex
@ -0,0 +1,42 @@
+\documentclass[preview]{standalone}
+\usepackage{varwidth}
+\usepackage[utf8x]{inputenc}
+\usepackage{amsmath,amssymb,graphics,bm,setspace}
+
+\begin{document}
+\begin{varwidth}{50in}
+  \begin{equation}
+    \mathcal{H}_{\rm long}= 
+    -\frac{\mu_{0} \left( \mu_B\right)^2}{4\pi} 
+    \sum_{i,j,i\neq j}^{N}
+    \frac{g_i g_j}{r_{ij}^3}
+    \Big(3 
+    \left(\bm{e}_{ij}\cdot \bm{s}_{i}\right) 
+    \left(\bm{e}_{ij}\cdot \bm{s}_{j}\right) 
+    -\bm{s}_i\cdot\bm{s}_j \Big)
+    \nonumber
+  \end{equation}
+  \begin{equation}
+    \bm{\omega}_i = 
+    \frac{\mu_0 (\mu_B)^2}{4\pi\hbar}\sum_{j}
+    \frac{g_i g_j}{r_{ij}^3}
+    \, \Big(
+    3\,(\bm{e}_{ij}\cdot\bm{s}_{j})\bm{e}_{ij}
+    -\bm{s}_{j} \Big) \nonumber
+  \end{equation}
+  \begin{equation}
+    \bm{F}_i =
+    \frac{3\, \mu_0 (\mu_B)^2}{4\pi} \sum_j
+    \frac{g_i g_j}{r_{ij}^4}
+    \Big[\big( (\bm{s}_i\cdot\bm{s}_j) 
+    -5(\bm{e}_{ij}\cdot\bm{s}_i)
+    (\bm{e}_{ij}\cdot\bm{s}_j)\big) \bm{e}_{ij}+
+    \big(
+    (\bm{e}_{ij}\cdot\bm{s}_i)\bm{s}_j+
+    (\bm{e}_{ij}\cdot\bm{s}_j)\bm{s}_i
+    \big)
+    \Big]
+    \nonumber
+  \end{equation}
+\end{varwidth}
+\end{document}
--- a/doc/src/Eqs/pair_spin_long_range.jpg
+++ b/doc/src/Eqs/pair_spin_long_range.jpg
--- a/doc/src/Eqs/pair_spin_long_range.tex
+++ b/doc/src/Eqs/pair_spin_long_range.tex
@ -0,0 +1,20 @@
+\documentclass[preview]{standalone}
+\usepackage{varwidth}
+\usepackage[utf8x]{inputenc}
+\usepackage{amsmath,amssymb,graphics,bm,setspace}
+
+\begin{document}
+\begin{varwidth}{50in}
+  \begin{equation}
+    \mathcal{H}_{\rm long}= 
+    -\frac{\mu_{0} \left( \mu_B\right)^2}{4\pi} 
+    \sum_{i,j,i\neq j}^{N}
+    \frac{g_i g_j}{r_{ij}^3}
+    \Big(3 
+    \left(\bm{e}_{ij}\cdot \bm{s}_{i}\right) 
+    \left(\bm{e}_{ij}\cdot \bm{s}_{j}\right) 
+    -\bm{s}_i\cdot\bm{s}_j \Big)
+    \nonumber
+  \end{equation}
+\end{varwidth}
+\end{document}
--- a/doc/src/Eqs/pair_spin_long_range_force.jpg
+++ b/doc/src/Eqs/pair_spin_long_range_force.jpg
--- a/doc/src/Eqs/pair_spin_long_range_force.tex
+++ b/doc/src/Eqs/pair_spin_long_range_force.tex
@ -0,0 +1,23 @@
+\documentclass[preview]{standalone}
+\usepackage{varwidth}
+\usepackage[utf8x]{inputenc}
+\usepackage{amsmath,amssymb,graphics,bm,setspace}
+
+\begin{document}
+\begin{varwidth}{50in}
+  \begin{equation}
+    \bm{F}_i =
+    \frac{\mu_0 (\mu_B)^2}{4\pi} \sum_j
+    \frac{g_i g_j}{r_{ij}^4}
+    \Big[\big( (\bm{s}_i\cdot\bm{s}_j) 
+    -5(\bm{e}_{ij}\cdot\bm{s}_i)
+    (\bm{e}_{ij}\cdot\bm{s}_j)\big) \bm{e}_{ij}+
+    \big(
+    (\bm{e}_{ij}\cdot\bm{s}_i)\bm{s}_j+
+    (\bm{e}_{ij}\cdot\bm{s}_j)\bm{s}_i
+    \big)
+    \Big]
+    \nonumber
+  \end{equation}
+\end{varwidth}
+\end{document}
--- a/doc/src/Eqs/pair_spin_long_range_magforce.jpg
+++ b/doc/src/Eqs/pair_spin_long_range_magforce.jpg
--- a/doc/src/Eqs/pair_spin_long_range_magforce.tex
+++ b/doc/src/Eqs/pair_spin_long_range_magforce.tex
@ -0,0 +1,17 @@
+\documentclass[preview]{standalone}
+\usepackage{varwidth}
+\usepackage[utf8x]{inputenc}
+\usepackage{amsmath,amssymb,graphics,bm,setspace}
+
+\begin{document}
+\begin{varwidth}{50in}
+  \begin{equation}
+    \bm{\omega}_i = 
+    \frac{\mu_0 (\mu_B)^2}{4\pi\hbar}\sum_{j}
+    \frac{g_i g_j}{r_{ij}^3}
+    \, \Big(
+    3\,(\bm{e}_{ij}\cdot\bm{s}_{j})\bm{e}_{ij}
+    -\bm{s}_{j} \Big) \nonumber
+  \end{equation}
+\end{varwidth}
+\end{document}
--- a/doc/src/Errors_messages.txt
+++ b/doc/src/Errors_messages.txt
@ -636,12 +636,12 @@ Please ensure reaction map files are properly formatted. :dd

 {Bond/react: Atom affected by reaction too close to template edge} :dt

-This means an atom which changes type during the reaction is too close
-to an 'edge' atom defined in the superimpose file. This could cause
-incorrect assignment of bonds, angle, etc. Generally, this means you
-must include more atoms in your templates, such that there are at
-least two atoms between each atom involved in the reaction and an edge
-atom. :dd
+This means an atom which changes type or connectivity during the
+reaction is too close to an 'edge' atom defined in the superimpose
+file. This could cause incorrect assignment of bonds, angle, etc.
+Generally, this means you must include more atoms in your templates,
+such that there are at least two atoms between each atom involved in
+the reaction and an edge atom. :dd

 {Bond/react: Fix bond/react needs ghost atoms from farther away} :dt

@ -2202,10 +2202,6 @@ Self-explanatory. :dd

 This is a current restriction in LAMMPS. :dd

-{Cannot use pair hybrid with GPU neighbor list builds} :dt
-
-Neighbor list builds must be done on the CPU for this pair style. :dd
-
 {Cannot use pair tail corrections with 2d simulations} :dt

 The correction factors are only currently defined for 3d systems. :dd
@ -5523,10 +5519,6 @@ Self-explanatory. :dd
 For this pair style, you cannot run part of the force calculation on
 the host.  See the package command. :dd

-{GPU split param must be positive for hybrid pair styles} :dt
-
-See the package gpu command. :dd
-
 {GPUs are requested but Kokkos has not been compiled for CUDA} :dt

 Re-compile Kokkos with CUDA support to use GPUs. :dd
--- a/doc/src/Errors_warnings.txt
+++ b/doc/src/Errors_warnings.txt
@ -82,10 +82,14 @@ bond/angle/dihedral.  LAMMPS computes this by taking the maximum bond
 length, multiplying by the number of bonds in the interaction (e.g. 3
 for a dihedral) and adding a small amount of stretch. :dd

-{Bond/react: An atom in 'react #%d' changes bond connectivity but not atom type} :dt
+{Bond/react: Atom affected by reaction too close to template edge} :dt

-You may want to double-check that all atom types are properly assigned
-in the post-reaction template. :dd
+This means an atom which changes type or connectivity during the
+reaction is too close to an 'edge' atom defined in the superimpose
+file. This could cause incorrect assignment of bonds, angle, etc.
+Generally, this means you must include more atoms in your templates,
+such that there are at least two atoms between each atom involved in
+the reaction and an edge atom. :dd

 {Both groups in compute group/group have a net charge; the Kspace boundary correction to energy will be non-zero} :dt

--- a/doc/src/Manual.txt
+++ b/doc/src/Manual.txt
@ -1,7 +1,7 @@
 <!-- HTML_ONLY -->
 <HEAD>
 <TITLE>LAMMPS Users Manual</TITLE>
-<META NAME="docnumber" CONTENT="5 Jun 2019 version">
+<META NAME="docnumber" CONTENT="18 Jun 2019 version">
 <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
 <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
 </HEAD>
@ -21,7 +21,7 @@
 :line

 LAMMPS Documentation :c,h1
-5 Jun 2019 version :c,h2
+18 Jun 2019 version :c,h2

 "What is a LAMMPS version?"_Manual_version.html

--- a/doc/src/Speed_compare.txt
+++ b/doc/src/Speed_compare.txt
@ -104,7 +104,7 @@ code (with a performance penalty due to having data transfers between
 host and GPU). :ulb,l

 The GPU package requires neighbor lists to be built on the CPU when using
-exclusion lists, hybrid pair styles, or a triclinic simulation box. :l
+exclusion lists, or a triclinic simulation box. :l

 The GPU package can be compiled for CUDA or OpenCL and thus supports
 both, Nvidia and AMD GPUs well. On Nvidia hardware, using CUDA is typically
--- a/doc/src/compute_sna_atom.txt
+++ b/doc/src/compute_sna_atom.txt
@ -24,12 +24,7 @@ twojmax = band limit for bispectrum components (non-negative integer) :l
 R_1, R_2,... = list of cutoff radii, one for each type (distance units) :l
 w_1, w_2,... = list of neighbor weights, one for each type  :l
 zero or more keyword/value pairs may be appended :l
-keyword = {diagonal} or {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag} :l
-  {diagonal} value = {0} or {1} or {2} or {3}
-     {0} = all j1, j2, j <= twojmax, j2 <= j1
-     {1} = subset satisfying j1 == j2
-     {2} = subset satisfying j1 == j2 == j3
-     {3} = subset satisfying j2 <= j1 <= j
+keyword = {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag} :l
  {rmin0} value = parameter in distance to angle conversion (distance units)
  {switchflag} value = {0} or {1}
     {0} = do not use switching function
@ -44,7 +39,7 @@ keyword = {diagonal} or {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag

 [Examples:]

-compute b all sna/atom 1.4 0.99363 6 2.0 2.4 0.75 1.0 diagonal 3 rmin0 0.0
+compute b all sna/atom 1.4 0.99363 6 2.0 2.4 0.75 1.0 rmin0 0.0
 compute db all sna/atom 1.4 0.95 6 2.0 1.0
 compute vb all sna/atom 1.4 0.95 6 2.0 1.0 :pre

@ -151,7 +146,7 @@ The argument {rfac0} and the optional keyword {rmin0} define the
 linear mapping from radial distance to polar angle {theta0} on the
 3-sphere.

-The argument {twojmax} and the keyword {diagonal} define which
+The argument {twojmax} defines which
 bispectrum components are generated. See section below on output for a
 detailed explanation of the number of bispectrum components and the
 ordered in which they are listed.
@ -192,23 +187,18 @@ command that includes all pairs in the neighbor list.
 Compute {sna/atom} calculates a per-atom array, each column
 corresponding to a particular bispectrum component.  The total number
 of columns and the identity of the bispectrum component contained in
-each column depend on the values of {twojmax} and {diagonal}, as
+each column depend of the value of {twojmax}, as
 described by the following piece of python code:

 for j1 in range(0,twojmax+1):
-    if(diagonal==2):
-        print j1/2.,j1/2.,j1/2.
-    elif(diagonal==1):
-        for j in range(0,min(twojmax,2*j1)+1,2):
-            print j1/2.,j1/2.,j/2.
-    elif(diagonal==0):
-        for j2 in range(0,j1+1):
-            for j in range(j1-j2,min(twojmax,j1+j2)+1,2):
-                print j1/2.,j2/2.,j/2.
-    elif(diagonal==3):
-        for j2 in range(0,j1+1):
-            for j in range(j1-j2,min(twojmax,j1+j2)+1,2):
-                if (j>=j1): print j1/2.,j2/2.,j/2. :pre
+    for j2 in range(0,j1+1):
+        for j in range(j1-j2,min(twojmax,j1+j2)+1,2):
+            if (j>=j1): print j1/2.,j2/2.,j/2. :pre
+
+NOTE: the {diagonal} keyword allowing other possible choices
+for the number of bispectrum components was removed in 2019, 
+since all potentials use the value of 3, corresponding to the
+above set of bispectrum components.

 Compute {snad/atom} evaluates a per-atom array. The columns are
 arranged into {ntypes} blocks, listed in order of atom type {I}.  Each
@ -259,7 +249,7 @@ package"_Build_package.html doc page for more info.

 [Default:]

-The optional keyword defaults are {diagonal} = 0, {rmin0} = 0,
+The optional keyword defaults are {rmin0} = 0,
 {switchflag} = 1, {bzeroflag} = 1, {quadraticflag} = 0,

 :line
--- a/doc/src/compute_tally.txt
+++ b/doc/src/compute_tally.txt
@ -88,6 +88,8 @@ potentials only include the pair potential portion of the EAM
 interaction when used by this compute, not the embedding term.  Also
 bonded or Kspace interactions do not contribute to this compute.

+The computes in this package are not compatible with dynamic groups.
+
 [Related commands:]

 {compute group/group}_compute_group_group.html, {compute
--- a/doc/src/kspace_modify.txt
+++ b/doc/src/kspace_modify.txt
@ -392,7 +392,8 @@ boundaries can be set using "boundary"_boundary.html (the slab
 approximation in not needed).  The {slab} keyword is not currently
 supported by Ewald or PPPM when using a triclinic simulation cell. The
 slab correction has also been extended to point dipole interactions
-"(Klapp)"_#Klapp in "kspace_style"_kspace_style.html {ewald/disp}.
+"(Klapp)"_#Klapp in "kspace_style"_kspace_style.html {ewald/disp},
+{ewald/dipole}, and {pppm/dipole}.

 NOTE: If you wish to apply an electric field in the Z-direction, in
 conjunction with the {slab} keyword, you should do it by adding
--- a/doc/src/kspace_style.txt
+++ b/doc/src/kspace_style.txt
@ -20,6 +20,10 @@ style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg}
    accuracy = desired relative error in forces
  {ewald/omp} value = accuracy
    accuracy = desired relative error in forces
+  {ewald/dipole} value = accuracy
+    accuracy = desired relative error in forces
+  {ewald/dipole/spin} value = accuracy
+    accuracy = desired relative error in forces
  {pppm} value = accuracy
    accuracy = desired relative error in forces
  {pppm/cg} values = accuracy (smallq)
@ -47,6 +51,10 @@ style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg}
    accuracy = desired relative error in forces
  {pppm/stagger} value = accuracy
    accuracy = desired relative error in forces
+  {pppm/dipole} value = accuracy
+    accuracy = desired relative error in forces
+  {pppm/dipole/spin} value = accuracy
+    accuracy = desired relative error in forces
  {msm} value = accuracy
    accuracy = desired relative error in forces
  {msm/cg} value = accuracy (smallq)
@ -105,9 +113,15 @@ The {ewald/disp} style adds a long-range dispersion sum option for
 but in a more efficient manner than the {ewald} style.  The 1/r^6
 capability means that Lennard-Jones or Buckingham potentials can be
 used without a cutoff, i.e. they become full long-range potentials.
-The {ewald/disp} style can also be used with point-dipoles
-"(Toukmaji)"_#Toukmaji and is currently the only kspace solver in
-LAMMPS with this capability.
+The {ewald/disp} style can also be used with point-dipoles, see
+"(Toukmaji)"_#Toukmaji.
+
+The {ewald/dipole} style adds long-range standard Ewald summations 
+for dipole-dipole interactions, see "(Toukmaji)"_#Toukmaji.
+
+The {ewald/dipole/spin} style adds long-range standard Ewald 
+summations for magnetic dipole-dipole interactions between
+magnetic spins.

 :line

@ -128,6 +142,12 @@ The optional {smallq} argument defines the cutoff for the absolute
 charge value which determines whether a particle is considered charged
 or not.  Its default value is 1.0e-5.

+The {pppm/dipole} style invokes a particle-particle particle-mesh solver 
+for dipole-dipole interactions, following the method of "(Cerda)"_#Cerda2008.
+
+The {pppm/dipole/spin} style invokes a particle-particle particle-mesh solver 
+for magnetic dipole-dipole interactions between magnetic spins. 
+
 The {pppm/tip4p} style is identical to the {pppm} style except that it
 adds a charge at the massless 4th site in each TIP4P water molecule.
 It should be used with "pair styles"_pair_style.html with a
@ -317,7 +337,10 @@ using ideas from chapter 3 of "(Hardy)"_#Hardy2006, with equation 3.197
 of particular note. When using {msm} with non-periodic boundary
 conditions, it is expected that the error estimation will be too
 pessimistic. RMS force errors for dipoles when using {ewald/disp}
-are estimated using equations 33 and 46 of "(Wang)"_#Wang.
+or {ewald/dipole} are estimated using equations 33 and 46 of
+"(Wang)"_#Wang. The RMS force errors for {pppm/dipole} are estimated
+using the equations in "(Cerda)"_#Cerda2008.
+

 See the "kspace_modify"_kspace_modify.html command for additional
 options of the K-space solvers that can be set, including a {force}
@ -464,6 +487,9 @@ Illinois at Urbana-Champaign, (2006).
 :link(Sutmann2013)
 [(Sutmann)] Sutmann, Arnold, Fahrenberger, et. al., Physical review / E 88(6), 063308 (2013)

+:link(Cerda2008)
+[(Cerda)] Cerda, Ballenegger, Lenz, Holm, J Chem Phys 129, 234104 (2008)
+
 :link(Who2012)
 [(Who)] Who, Author2, Author3, J of Long Range Solvers, 35, 164-177
 (2012).
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@ -176,12 +176,10 @@ computation will be built.  If {neigh} is {yes}, which is the default,
 neighbor list building is performed on the GPU.  If {neigh} is {no},
 neighbor list building is performed on the CPU.  GPU neighbor list
 building currently cannot be used with a triclinic box.  GPU neighbor
-list calculation currently cannot be used with
-"hybrid"_pair_hybrid.html pair styles.  GPU neighbor lists are not
-compatible with commands that are not GPU-enabled.  When a non-GPU
-enabled command requires a neighbor list, it will also be built on the
-CPU.  In these cases, it will typically be more efficient to only use
-CPU neighbor list builds.
+lists are not compatible with commands that are not GPU-enabled.  When
+a non-GPU enabled command requires a neighbor list, it will also be
+built on the CPU.  In these cases, it will typically be more efficient
+to only use CPU neighbor list builds.

 The {newton} keyword sets the Newton flags for pairwise (not bonded)
 interactions to {off} or {on}, the same as the "newton"_newton.html
--- a/doc/src/pair_class2.txt
+++ b/doc/src/pair_class2.txt
@ -155,9 +155,12 @@ All of the lj/class2 pair styles write their information to "binary
 restart files"_restart.html, so pair_style and pair_coeff commands do
 not need to be specified in an input script that reads a restart file.

-All of the lj/class2 pair styles can only be used via the {pair}
-keyword of the "run_style respa"_run_style.html command.  They do not
-support the {inner}, {middle}, {outer} keywords.
+Only the {lj/class2} pair style support the use of the
+{inner}, {middle}, and {outer} keywords of the "run_style
+respa"_run_style.html command, meaning the pairwise forces can be
+partitioned by distance at different levels of the rRESPA hierarchy.
+The other styles only support the {pair} keyword of run_style respa.
+See the "run_style"_run_style.html command for details.

 [Restrictions:]

--- a/doc/src/pair_snap.txt
+++ b/doc/src/pair_snap.txt
@ -38,7 +38,7 @@ where {B_k^i} is the {k}-th bispectrum component of atom {i},
 and {beta_k^alpha_i} is the corresponding linear coefficient
 that depends on {alpha_i}, the SNAP element of atom {i}. The
 number of bispectrum components used and their definitions
-depend on the values of {twojmax} and {diagonalstyle}
+depend on the value of {twojmax}
 defined in the SNAP parameter file described below.
 The bispectrum calculation is described in more detail
 in "compute sna/atom"_compute_sna_atom.html.
@ -125,14 +125,13 @@ This line is followed by {ncoeff} coefficients, one per line.
 The SNAP parameter file can contain blank and comment lines (start
 with #) anywhere. Each non-blank non-comment line must contain one
 keyword/value pair. The required keywords are {rcutfac} and
-{twojmax}. Optional keywords are {rfac0}, {rmin0}, {diagonalstyle},
+{twojmax}. Optional keywords are {rfac0}, {rmin0},
 {switchflag}, and {bzeroflag}.

 The default values for these keywords are

 {rfac0} = 0.99363
 {rmin0} = 0.0
-{diagonalstyle} = 3
 {switchflag} = 0
 {bzeroflag} = 1
 {quadraticflag} = 1 :ul
@ -144,6 +143,9 @@ If {quadraticflag} is set to 1, then the SNAP energy expression includes the qua
 The SNAP element file should contain {K}({K}+1)/2 additional coefficients
 for each element, the upper-triangular elements of alpha.

+NOTE: The previously used {diagonalstyle} keyword was removed in 2019,
+since all known SNAP potentials use the default value of 3.
+
 :line

 [Mixing, shift, table, tail correction, restart, rRESPA info]:
--- a/doc/src/pair_spin_dipole.txt
+++ b/doc/src/pair_spin_dipole.txt
@ -0,0 +1,89 @@
+"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Commands_all.html)
+
+:line
+
+pair_style spin/dipole/cut command :h3
+pair_style spin/dipole/long command :h3
+
+[Syntax:]
+
+pair_style spin/dipole/cut cutoff 
+pair_style spin/dipole/long cutoff :pre
+
+cutoff = global cutoff for magnetic dipole energy and forces
+(optional) (distance units) :ulb,l
+:ule
+
+[Examples:]
+
+pair_style spin/dipole/cut 10.0
+pair_coeff * * 10.0 
+pair_coeff 2 3 8.0 :pre
+
+pair_style spin/dipole/long 9.0
+pair_coeff * * 1.0 1.0
+pair_coeff 2 3 1.0 1.0 2.5 4.0 scale 0.5
+pair_coeff 2 3 1.0 1.0 2.5 4.0 :pre
+
+[Description:]
+
+Style {spin/dipole/cut} computes a short-range dipole-dipole
+interaction between pairs of magnetic particles that each 
+have a magnetic spin. 
+The magnetic dipole-dipole interactions are computed by the
+following formulas for the magnetic energy, magnetic precession 
+vector omega and mechanical force between particles I and J.
+
+:c,image(Eqs/pair_spin_dipole.jpg)
+
+where si and sj are the spin on two magnetic particles, 
+r is their separation distance, and the vector e = (Ri - Rj)/|Ri - Rj| 
+is the direction vector between the two particles.  
+
+Style {spin/dipole/long} computes long-range magnetic dipole-dipole
+interaction.
+A "kspace_style"_kspace_style.html must be defined to
+use this pair style.  Currently, "kspace_style 
+ewald/dipole/spin"_kspace_style.html and "kspace_style
+pppm/dipole/spin"_kspace_style.html support long-range magnetic 
+dipole-dipole interactions.
+
+:line
+
+The "pair_modify"_pair_modify.html table option is not relevant
+for this pair style.
+
+This pair style does not support the "pair_modify"_pair_modify.html
+tail option for adding long-range tail corrections to energy and
+pressure.
+
+This pair style writes its information to "binary restart
+files"_restart.html, so pair_style and pair_coeff commands do not need
+to be specified in an input script that reads a restart file.
+
+[Restrictions:]
+
+The {spin/dipole/cut} and {spin/dipole/long} styles are part of
+the SPIN package.  They are only enabled if LAMMPS was built with that 
+package.  See the "Build package"_Build_package.html doc page for more 
+info.
+
+Using dipole/spin pair styles with {electron} "units"_units.html is not
+currently supported.
+
+[Related commands:]
+
+"pair_coeff"_pair_coeff.html, "kspace_style"_kspace_style.html
+"fix nve/spin"_fix_nve_spin.html
+
+[Default:] none
+
+:line
+
+:link(Allen2)
+[(Allen)] Allen and Tildesley, Computer Simulation of Liquids,
+Clarendon Press, Oxford, 1987.
--- a/doc/src/variable.txt
+++ b/doc/src/variable.txt
@ -52,7 +52,7 @@ style = {delete} or {index} or {loop} or {world} or {universe} or {uloop} or {st
                     sin(x), cos(x), tan(x), asin(x), acos(x), atan(x), atan2(y,x),
                     random(x,y,z), normal(x,y,z), ceil(x), floor(x), round(x)
                     ramp(x,y), stagger(x,y), logfreq(x,y,z), logfreq2(x,y,z),
-                     stride(x,y,z), stride2(x,y,z,a,b,c),
+                     logfreq3(x,y,z), stride(x,y,z), stride2(x,y,z,a,b,c),
                     vdisplace(x,y), swiggle(x,y,z), cwiggle(x,y,z)
    group functions = count(group), mass(group), charge(group),
                      xcm(group,dim), vcm(group,dim), fcm(group,dim),
@ -459,8 +459,8 @@ Math functions: sqrt(x), exp(x), ln(x), log(x), abs(x), \
     sin(x), cos(x), tan(x), asin(x), acos(x), atan(x), atan2(y,x), \
     random(x,y,z), normal(x,y,z), ceil(x), floor(x), round(x), \
     ramp(x,y), stagger(x,y), logfreq(x,y,z), logfreq2(x,y,z), \
-     stride(x,y,z), stride2(x,y,z,a,b,c), vdisplace(x,y), \
-     swiggle(x,y,z), cwiggle(x,y,z)
+     logfreq3(x,y,z), stride(x,y,z), stride2(x,y,z,a,b,c), \
+     vdisplace(x,y), swiggle(x,y,z), cwiggle(x,y,z)
 Group functions: count(ID), mass(ID), charge(ID), xcm(ID,dim), \
     vcm(ID,dim), fcm(ID,dim), bound(ID,dir), \
     gyration(ID), ke(ID), angmom(ID,dim), torque(ID,dim), \
@ -670,6 +670,16 @@ sequence of output timesteps:

 100,150,200,...950,1000,1500,2000,...9500,10000,15000,etc :pre

+The logfreq3(x,y,z) function generates y points between x and z (inclusive),
+that are separated by a multiplicative ratio: (z/x)^(1/(y-1)). Constraints
+are: x,z > 0, y > 1, z-x >= y-1. For eg., if logfreq3(10,25,1000) is used in
+a variable by the "fix print"_fix_print.html command, then the interval
+between 10 and 1000 is divided into 24 parts with a multiplicative
+separation of ~1.21, and it will generate the following sequence of output
+timesteps:
+
+10, 13, 15, 18, 22, 27, 32,...384, 465, 563, 682, 826, 1000 :pre
+
 The stride(x,y,z) function uses the current timestep to generate a new
 timestep.  X,y >= 0 and z > 0 and x <= y are required.  The generated
 timesteps increase in increments of z, from x to y, i.e. it generates
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@ -1408,6 +1408,7 @@ Lenart
 lennard
 Lennard
 Lenosky
+Lenz
 Lett
 Leuven
 Leven
--- a/examples/SPIN/cobalt_hcp/in.spin.cobalt_hcp
+++ b/examples/SPIN/cobalt_hcp/in.spin.cobalt_hcp
@ -25,16 +25,18 @@ velocity 	all create 100 4928459 rot yes dist gaussian

 #pair_style 	hybrid/overlay eam/alloy spin/exchange 4.0 spin/neel 4.0
 pair_style 	hybrid/overlay eam/alloy spin/exchange 4.0
-pair_coeff 	* * eam/alloy Co_PurjaPun_2012.eam.alloy Co
-pair_coeff 	* * spin/exchange exchange 4.0 0.3593 1.135028015e-05 1.064568567
+pair_coeff 	* * eam/alloy ../examples/SPIN/cobalt_hcp/Co_PurjaPun_2012.eam.alloy Co
+#pair_coeff 	* * eam/alloy Co_PurjaPun_2012.eam.alloy Co
+pair_coeff 	* * spin/exchange exchange 4.0 -0.3593 1.135028015e-05 1.064568567
 #pair_coeff 	* * spin/neel neel 4.0 0.0048 0.234 1.168 2.6905 0.705 0.652  

 neighbor 	0.1 bin
 neigh_modify 	every 10 check yes delay 20

 #fix 		1 all precession/spin zeeman 1.0 0.0 0.0 1.0
-fix 		1 all precession/spin zeeman 0.0 0.0 0.0 1.0
-fix 		2 all langevin/spin 0.0 0.0 21
+fix 		1 all precession/spin anisotropy 0.01 0.0 0.0 1.0
+#fix 		2 all langevin/spin 0.0 0.0 21
+fix 		2 all langevin/spin 0.0 0.1 21
 fix 		3 all nve/spin lattice yes

 timestep	0.0001
--- a/examples/SPIN/dipole_spin/Fe_Mishin2006.eam.alloy
+++ b/examples/SPIN/dipole_spin/Fe_Mishin2006.eam.alloy
@ -0,0 +1 @@
+../iron/Fe_Mishin2006.eam.alloy
--- a/examples/SPIN/dipole_spin/exchange_fit_bcc_iron/exchange_bcc_iron.dat
+++ b/examples/SPIN/dipole_spin/exchange_fit_bcc_iron/exchange_bcc_iron.dat
@ -0,0 +1,5 @@
+2.4824 0.01948336
+2.8665 0.01109
+4.0538 -0.0002176
+4.753 -0.001714
+4.965 -0.001986
--- a/examples/SPIN/dipole_spin/exchange_fit_bcc_iron/exchange_fit.py
+++ b/examples/SPIN/dipole_spin/exchange_fit_bcc_iron/exchange_fit.py
@ -0,0 +1,32 @@
+#Program fitting the exchange interaction
+#Model curve: Bethe-Slater function
+import numpy as np, pylab, tkinter
+import matplotlib.pyplot as plt
+from scipy.optimize import curve_fit
+from decimal import *
+
+print("Loop begin")
+
+#Definition of the Bethe-Slater function
+def func(x,a,b,c):
+    return 4*a*((x/c)**2)*(1-b*(x/c)**2)*np.exp(-(x/c)**2)
+
+#Exchange coeff table (data to fit)
+rdata, Jdata = np.loadtxt('exchange_bcc_iron.dat', usecols=(0,1), unpack=True)
+plt.plot(rdata, Jdata, 'b-', label='data')
+
+#Perform the fit
+popt, pcov = curve_fit(func, rdata, Jdata, bounds=(0, [500.,5.,5.]))
+plt.plot(rdata, func(rdata, *popt), 'r--', label='fit')
+
+#Print the fitted params
+print("Parameters: a={:.10} (in meV), b={:.10} (adim), c={:.10} (in Ang)".format(*popt))
+
+#Ploting the result
+plt.xlabel('r_ij')
+pylab.xlim([0,6.5])
+plt.ylabel('J_ij')
+plt.legend()
+plt.show()
+
+print("Loop end")
--- a/examples/SPIN/dipole_spin/fe_dd.dat
+++ b/examples/SPIN/dipole_spin/fe_dd.dat
@ -0,0 +1,19 @@
+            6            8
+ Optimal parameter set
+            1    4.100199340884814       F
+            2    1.565647547483517       F
+            1   0.9332056681088162       T    3.000000000000000     
+            2   -1.162558782567700       T    2.866666666666670     
+            3  -0.3502026949249225       T    2.733333333333330     
+            4   0.4287820835430028       T    2.600000000000000     
+            5    4.907925057809273       T    2.400000000000000     
+            6   -5.307049068415304       T    2.300000000000000     
+            1  -0.1960674387419232       F    4.100000000000000     
+            2   0.3687525935422963       F    3.800000000000000     
+            3   -1.505333614924853       F    3.500000000000000     
+            4    4.948907078156191       T    3.200000000000000     
+            5   -4.894613262753399       T    2.900000000000000     
+            6    3.468897724782442       T    2.600000000000000     
+            7   -1.792218099820337       T    2.400000000000000     
+            8    80.22069592246987       T    2.300000000000000     
+
--- a/examples/SPIN/dipole_spin/in.spin.iron_dipole_cut
+++ b/examples/SPIN/dipole_spin/in.spin.iron_dipole_cut
@ -0,0 +1,59 @@
+# bcc iron in a 3d periodic box
+
+clear 
+units 		metal
+atom_style 	spin
+
+dimension 	3
+boundary 	p p p
+
+# necessary for the serial algorithm (sametag)
+atom_modify 	map array 
+
+lattice 	bcc 2.8665
+region 		box block 0.0 5.0 0.0 5.0 0.0 5.0
+create_box 	1 box
+create_atoms 	1 box
+
+# setting mass, mag. moments, and interactions for bcc iron
+
+mass		1 55.845
+set 		group all spin 2.2 -1.0 0.0 0.0
+velocity 	all create 100 4928459 rot yes dist gaussian
+
+pair_style 	hybrid/overlay eam/alloy spin/exchange 3.5 spin/dipole/cut 8.0
+pair_coeff 	* * eam/alloy Fe_Mishin2006.eam.alloy Fe
+pair_coeff 	* * spin/exchange exchange 3.4 0.02726 0.2171 1.841
+pair_coeff      * * spin/dipole/cut 8.0
+
+neighbor 	0.1 bin
+neigh_modify 	every 10 check yes delay 20
+
+fix 		1 all precession/spin cubic 0.001 0.0005 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
+fix_modify 	1 energy yes
+fix 		2 all langevin/spin 0.0 0.0 21
+
+fix 		3 all nve/spin lattice yes
+timestep	0.0001
+
+# compute and output options
+
+compute 	out_mag    all spin
+compute 	out_pe     all pe
+compute 	out_ke     all ke
+compute 	out_temp   all temp
+
+variable 	magx      equal c_out_mag[1]
+variable 	magy      equal c_out_mag[2]
+variable 	magz      equal c_out_mag[3]
+variable 	magnorm   equal c_out_mag[4]
+variable 	emag      equal c_out_mag[5]
+variable 	tmag      equal c_out_mag[6]
+
+thermo_style    custom step time v_magx v_magy v_magz v_magnorm v_tmag v_emag pe etotal
+thermo          50
+
+compute 	outsp all property/atom spx spy spz sp fmx fmy fmz
+dump 		1 all custom 100 dump_iron.lammpstrj type x y z c_outsp[1] c_outsp[2] c_outsp[3]
+
+run 		2000
--- a/examples/SPIN/dipole_spin/in.spin.iron_dipole_ewald
+++ b/examples/SPIN/dipole_spin/in.spin.iron_dipole_ewald
@ -0,0 +1,61 @@
+# bcc iron in a 3d periodic box
+
+clear 
+units 		metal
+atom_style 	spin
+
+dimension 	3
+boundary 	p p p
+
+# necessary for the serial algorithm (sametag)
+atom_modify 	map array 
+
+lattice 	bcc 2.8665
+region 		box block 0.0 5.0 0.0 5.0 0.0 5.0
+create_box 	1 box
+create_atoms 	1 box
+
+# setting mass, mag. moments, and interactions for bcc iron
+
+mass		1 55.845
+set 		group all spin 2.2 -1.0 0.0 0.0
+velocity 	all create 100 4928459 rot yes dist gaussian
+
+pair_style 	hybrid/overlay eam/alloy spin/exchange 3.5 spin/dipole/long 8.0
+pair_coeff 	* * eam/alloy Fe_Mishin2006.eam.alloy Fe
+pair_coeff 	* * spin/exchange exchange 3.4 0.02726 0.2171 1.841
+pair_coeff	* * spin/dipole/long 8.0
+
+neighbor 	0.1 bin
+neigh_modify 	every 10 check yes delay 20
+
+kspace_style 	ewald/dipole/spin 1.0e-4
+
+fix 		1 all precession/spin cubic 0.001 0.0005 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
+fix_modify 	1 energy yes
+fix 		2 all langevin/spin 0.0 0.0 21
+
+fix 		3 all nve/spin lattice yes
+timestep	0.0001
+
+# compute and output options
+
+compute 	out_mag    all spin
+compute 	out_pe     all pe
+compute 	out_ke     all ke
+compute 	out_temp   all temp
+
+variable 	magx      equal c_out_mag[1]
+variable 	magy      equal c_out_mag[2]
+variable 	magz      equal c_out_mag[3]
+variable 	magnorm   equal c_out_mag[4]
+variable 	emag      equal c_out_mag[5]
+variable 	tmag      equal c_out_mag[6]
+
+thermo_style    custom step time v_magx v_magy v_magz v_magnorm v_tmag v_emag pe etotal
+thermo          50
+
+compute 	outsp all property/atom spx spy spz sp fmx fmy fmz
+dump 		1 all custom 100 dump_iron.lammpstrj type x y z c_outsp[1] c_outsp[2] c_outsp[3]
+
+run 		2000
--- a/examples/SPIN/dipole_spin/in.spin.iron_dipole_pppm
+++ b/examples/SPIN/dipole_spin/in.spin.iron_dipole_pppm
@ -0,0 +1,62 @@
+# bcc iron in a 3d periodic box
+
+clear 
+units 		metal
+atom_style 	spin
+
+dimension 	3
+boundary 	p p p
+
+# necessary for the serial algorithm (sametag)
+atom_modify 	map array 
+
+lattice 	bcc 2.8665
+region 		box block 0.0 5.0 0.0 5.0 0.0 5.0
+create_box 	1 box
+create_atoms 	1 box
+
+# setting mass, mag. moments, and interactions for bcc iron
+
+mass		1 55.845
+set 		group all spin 2.2 -1.0 0.0 0.0
+velocity 	all create 100 4928459 rot yes dist gaussian
+
+pair_style 	hybrid/overlay eam/alloy spin/exchange 3.5 spin/dipole/long 8.0
+pair_coeff 	* * eam/alloy Fe_Mishin2006.eam.alloy Fe
+pair_coeff 	* * spin/exchange exchange 3.4 0.02726 0.2171 1.841
+pair_coeff	* * spin/dipole/long 8.0
+
+neighbor 	0.1 bin
+neigh_modify 	every 10 check yes delay 20
+
+kspace_style 	pppm/dipole/spin 1.0e-4
+kspace_modify 	compute yes
+
+fix 		1 all precession/spin cubic 0.001 0.0005 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
+fix_modify 	1 energy yes
+fix 		2 all langevin/spin 0.0 0.0 21
+
+fix 		3 all nve/spin lattice yes
+timestep	0.0001
+
+# compute and output options
+
+compute 	out_mag    all spin
+compute 	out_pe     all pe
+compute 	out_ke     all ke
+compute 	out_temp   all temp
+
+variable 	magx      equal c_out_mag[1]
+variable 	magy      equal c_out_mag[2]
+variable 	magz      equal c_out_mag[3]
+variable 	magnorm   equal c_out_mag[4]
+variable 	emag      equal c_out_mag[5]
+variable 	tmag      equal c_out_mag[6]
+
+thermo_style    custom step time v_magx v_magy v_magz v_magnorm v_tmag v_emag pe etotal
+thermo          50
+
+compute 	outsp all property/atom spx spy spz sp fmx fmy fmz
+dump 		1 all custom 100 dump_iron.lammpstrj type x y z c_outsp[1] c_outsp[2] c_outsp[3]
+
+run 		2000
--- a/examples/SPIN/iron/in.spin.iron
+++ b/examples/SPIN/iron/in.spin.iron
@ -19,7 +19,8 @@ create_atoms 	1 box

 mass		1 55.845

-set 		group all spin/random 31 2.2
+#set 		group all spin/random 31 2.2
+set 		group all spin 2.2 0.0 0.0 1.0
 velocity 	all create 100 4928459 rot yes dist gaussian

 pair_style 	hybrid/overlay eam/alloy spin/exchange 3.5
--- a/examples/USER/misc/kolmogorov_crespi_full/CH.rebo
+++ b/examples/USER/misc/kolmogorov_crespi_full/CH.rebo
@ -0,0 +1 @@
+../../../../potentials/CH.rebo
--- a/examples/USER/misc/kolmogorov_crespi_full/in.bilayer-graphene
+++ b/examples/USER/misc/kolmogorov_crespi_full/in.bilayer-graphene
@ -18,7 +18,7 @@ group adsorbate type 2
 ######################## Potential defition ########################
 pair_style  hybrid/overlay  rebo  kolmogorov/crespi/full 16.0
 ####################################################################
-pair_coeff  * * rebo                     CH.airebo      NULL C  # chemical
+pair_coeff  * * rebo                     CH.rebo      NULL C  # chemical
 pair_coeff  * * kolmogorov/crespi/full   CC.KC-full     C    C  # long range
 ####################################################################
 # Neighbor update settings
--- a/examples/USER/misc/kolmogorov_crespi_full/log.16Mar18.bilayer-graphene.g++.1
+++ b/examples/USER/misc/kolmogorov_crespi_full/log.16Mar18.bilayer-graphene.g++.1
@ -1,4 +1,5 @@
-LAMMPS (8 Mar 2018)
+LAMMPS (5 Jun 2019)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:88)
  using 1 OpenMP thread(s) per MPI task
 # Initialization
 units           metal
@ -21,6 +22,8 @@ read_data       Bi_gr_AB_stack_2L_noH_300K.data
  0 = max # of 1-3 neighbors
  0 = max # of 1-4 neighbors
  1 = max # of special neighbors
+  special bonds CPU = 0.000353813 secs
+  read_data CPU = 0.0043292 secs
 mass            1 12.0107   # carbon mass (g/mole) | membrane
 mass            2 12.0107   # carbon mass (g/mole) | adsorbate
 # Separate atom groups
@ -32,8 +35,8 @@ group adsorbate type 2
 ######################## Potential defition ########################
 pair_style  hybrid/overlay  rebo  kolmogorov/crespi/full 16.0
 ####################################################################
-pair_coeff  * * rebo                     CH.airebo      NULL C  # chemical
-Reading potential file CH.airebo with DATE: 2011-10-25
+pair_coeff  * * rebo                     CH.rebo      NULL C  # chemical
+Reading potential file CH.rebo with DATE: 2018-7-3
 pair_coeff  * * kolmogorov/crespi/full   CC.KC-full     C    C  # long range
 ####################################################################
 # Neighbor update settings
@ -92,32 +95,32 @@ Neighbor list info ...
      bin: standard
 Per MPI rank memory allocation (min/avg/max) = 16.96 | 16.96 | 16.96 Mbytes
 Step TotEng PotEng KinEng v_REBO v_KC Temp v_adsxcom v_adsycom v_adszcom v_adsvxcom v_adsvycom v_adsvzcom 
-       0 -5025.3867722725 -5040.0767391239 14.6899668514 -5011.2636297759 -28.8131093480 83.6251135127 22.0155657205 20.2812150219 3.4623630945 0.0282287195 0.0535565745 0.2193320108 
-     100 -5025.3962433293 -5041.3829775585 15.9867342292 -5012.5109377234 -28.8720398351 91.0071804888 22.0181858078 20.2867731676 3.4456714402 0.0241525932 0.0573807336 -0.5235069014 
-     200 -5025.3942568861 -5041.9638220670 16.5695651809 -5012.7804299195 -29.1833921475 94.3250439654 22.0203529515 20.2926376511 3.3740502908 0.0186420748 0.0595018114 -0.7867265577 
-     300 -5025.3919463074 -5040.9705419367 15.5785956293 -5012.0510295102 -28.9195124265 88.6837826830 22.0218424095 20.2984380400 3.3199036613 0.0106250874 0.0544668352 -0.1513745908 
-     400 -5025.3965376948 -5041.6929964127 16.2964587179 -5012.6418090677 -29.0511873450 92.7703393702 22.0224243957 20.3034636122 3.3515794172 0.0006844935 0.0458598502 0.6967704496 
-     500 -5025.4050172900 -5042.1712310053 16.7662137153 -5013.1850218645 -28.9862091408 95.4444989087 22.0220673443 20.3074634962 3.4286173278 -0.0078273439 0.0340764532 0.6845095066 
-     600 -5025.3985715734 -5041.2158947893 15.8173232159 -5012.4875319345 -28.7283628548 90.0427797270 22.0209262700 20.3103065099 3.4653840648 -0.0141442608 0.0229602847 0.0009001093 
-     700 -5025.3997561572 -5041.6276721306 16.2279159734 -5012.7093581188 -28.9183140118 92.3801482386 22.0191651506 20.3120184840 3.4291788224 -0.0208485646 0.0104216414 -0.6668311564 
-     800 -5025.3967603736 -5042.3401685987 16.9434082251 -5013.3044877099 -29.0356808888 96.4532085367 22.0167259920 20.3122737443 3.3535033285 -0.0279747378 -0.0060833621 -0.7003492925 
-     900 -5025.3984542801 -5042.2820667481 16.8836124680 -5013.4066841442 -28.8753826039 96.1128111061 22.0136711877 20.3107854823 3.3206430872 -0.0331979094 -0.0237440547 0.1335648638 
-    1000 -5025.3988185618 -5041.9160822433 16.5172636815 -5012.8147737982 -29.1013084450 94.0273088606 22.0102627032 20.3075977018 3.3736867454 -0.0340065996 -0.0390649991 0.7872380119 
-Loop time of 156.142 on 1 procs for 1000 steps with 1360 atoms
+       0 -5025.3867727863 -5040.0767396377 14.6899668514 -5011.2636302897 -28.8131093480 83.6251135127 22.0155657205 20.2812150219 3.4623630945 0.0282287195 0.0535565745 0.2193320108 
+     100 -5025.3962438431 -5041.3829780735 15.9867342304 -5012.5109382383 -28.8720398352 91.0071804956 22.0181858078 20.2867731676 3.4456714402 0.0241525932 0.0573807336 -0.5235069015 
+     200 -5025.3942574000 -5041.9638225847 16.5695651847 -5012.7804304371 -29.1833921476 94.3250439874 22.0203529515 20.2926376511 3.3740502908 0.0186420748 0.0595018114 -0.7867265578 
+     300 -5025.3919468212 -5040.9705424499 15.5785956286 -5012.0510300232 -28.9195124266 88.6837826792 22.0218424095 20.2984380400 3.3199036613 0.0106250874 0.0544668352 -0.1513745907 
+     400 -5025.3965382086 -5041.6929969192 16.2964587107 -5012.6418095739 -29.0511873454 92.7703393292 22.0224243957 20.3034636122 3.3515794172 0.0006844935 0.0458598502 0.6967704497 
+     500 -5025.4050178038 -5042.1712315208 16.7662137170 -5013.1850223792 -28.9862091417 95.4444989189 22.0220673443 20.3074634962 3.4286173278 -0.0078273439 0.0340764532 0.6845095066 
+     600 -5025.3985720873 -5041.2158953052 15.8173232179 -5012.4875324499 -28.7283628553 90.0427797386 22.0209262700 20.3103065099 3.4653840648 -0.0141442608 0.0229602847 0.0009001092 
+     700 -5025.3997566711 -5041.6276726420 16.2279159709 -5012.7093586298 -28.9183140122 92.3801482242 22.0191651506 20.3120184840 3.4291788224 -0.0208485646 0.0104216414 -0.6668311565 
+     800 -5025.3967608874 -5042.3401691104 16.9434082230 -5013.3044882226 -29.0356808878 96.4532085250 22.0167259920 20.3122737443 3.3535033285 -0.0279747378 -0.0060833621 -0.7003492926 
+     900 -5025.3984547937 -5042.2820672614 16.8836124676 -5013.4066846579 -28.8753826035 96.1128111040 22.0136711877 20.3107854823 3.3206430872 -0.0331979094 -0.0237440547 0.1335648640 
+    1000 -5025.3988190757 -5041.9160827657 16.5172636900 -5012.8147743212 -29.1013084444 94.0273089090 22.0102627032 20.3075977018 3.3736867454 -0.0340065996 -0.0390649991 0.7872380119 
+Loop time of 103.724 on 1 procs for 1000 steps with 1360 atoms

-Performance: 0.553 ns/day, 43.373 hours/ns, 6.404 timesteps/s
-99.6% CPU use with 1 MPI tasks x 1 OpenMP threads
+Performance: 0.833 ns/day, 28.812 hours/ns, 9.641 timesteps/s
+99.9% CPU use with 1 MPI tasks x 1 OpenMP threads

 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 155.99     | 155.99     | 155.99     |   0.0 | 99.90
-Bond    | 0.00075769 | 0.00075769 | 0.00075769 |   0.0 |  0.00
+Pair    | 103.59     | 103.59     | 103.59     |   0.0 | 99.87
+Bond    | 0.00022388 | 0.00022388 | 0.00022388 |   0.0 |  0.00
 Neigh   | 0          | 0          | 0          |   0.0 |  0.00
-Comm    | 0.084217   | 0.084217   | 0.084217   |   0.0 |  0.05
-Output  | 0.0016122  | 0.0016122  | 0.0016122  |   0.0 |  0.00
-Modify  | 0.034797   | 0.034797   | 0.034797   |   0.0 |  0.02
-Other   |            | 0.02838    |            |       |  0.02
+Comm    | 0.082476   | 0.082476   | 0.082476   |   0.0 |  0.08
+Output  | 0.0010884  | 0.0010884  | 0.0010884  |   0.0 |  0.00
+Modify  | 0.032938   | 0.032938   | 0.032938   |   0.0 |  0.03
+Other   |            | 0.01749    |            |       |  0.02

 Nlocal:    1360 ave 1360 max 1360 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
@ -133,4 +136,4 @@ Ave neighs/atom = 195.004
 Ave special neighs/atom = 0
 Neighbor list builds = 0
 Dangerous builds = 0
-Total wall time: 0:02:36
+Total wall time: 0:01:43
--- a/examples/USER/misc/kolmogorov_crespi_full/log.16Mar18.bilayer-graphene.g++.4
+++ b/examples/USER/misc/kolmogorov_crespi_full/log.16Mar18.bilayer-graphene.g++.4
@ -1,4 +1,5 @@
-LAMMPS (8 Mar 2018)
+LAMMPS (5 Jun 2019)
+OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:88)
  using 1 OpenMP thread(s) per MPI task
 # Initialization
 units           metal
@ -21,6 +22,8 @@ read_data       Bi_gr_AB_stack_2L_noH_300K.data
  0 = max # of 1-3 neighbors
  0 = max # of 1-4 neighbors
  1 = max # of special neighbors
+  special bonds CPU = 0.000187874 secs
+  read_data CPU = 0.00234103 secs
 mass            1 12.0107   # carbon mass (g/mole) | membrane
 mass            2 12.0107   # carbon mass (g/mole) | adsorbate
 # Separate atom groups
@ -32,8 +35,8 @@ group adsorbate type 2
 ######################## Potential defition ########################
 pair_style  hybrid/overlay  rebo  kolmogorov/crespi/full 16.0
 ####################################################################
-pair_coeff  * * rebo                     CH.airebo      NULL C  # chemical
-Reading potential file CH.airebo with DATE: 2011-10-25
+pair_coeff  * * rebo                     CH.rebo      NULL C  # chemical
+Reading potential file CH.rebo with DATE: 2018-7-3
 pair_coeff  * * kolmogorov/crespi/full   CC.KC-full     C    C  # long range
 ####################################################################
 # Neighbor update settings
@ -92,32 +95,32 @@ Neighbor list info ...
      bin: standard
 Per MPI rank memory allocation (min/avg/max) = 11.13 | 11.13 | 11.13 Mbytes
 Step TotEng PotEng KinEng v_REBO v_KC Temp v_adsxcom v_adsycom v_adszcom v_adsvxcom v_adsvycom v_adsvzcom 
-       0 -5025.3867722725 -5040.0767391239 14.6899668514 -5011.2636297759 -28.8131093480 83.6251135127 22.0155657205 20.2812150219 3.4623630945 0.0282287195 0.0535565745 0.2193320108 
-     100 -5025.3962433293 -5041.3829775585 15.9867342292 -5012.5109377234 -28.8720398351 91.0071804888 22.0181858078 20.2867731676 3.4456714402 0.0241525932 0.0573807336 -0.5235069014 
-     200 -5025.3942568861 -5041.9638220670 16.5695651809 -5012.7804299195 -29.1833921475 94.3250439654 22.0203529515 20.2926376511 3.3740502908 0.0186420748 0.0595018114 -0.7867265577 
-     300 -5025.3919463074 -5040.9705419367 15.5785956293 -5012.0510295103 -28.9195124265 88.6837826830 22.0218424095 20.2984380400 3.3199036613 0.0106250874 0.0544668352 -0.1513745908 
-     400 -5025.3965376948 -5041.6929964127 16.2964587179 -5012.6418090677 -29.0511873450 92.7703393702 22.0224243957 20.3034636122 3.3515794172 0.0006844935 0.0458598502 0.6967704496 
-     500 -5025.4050172900 -5042.1712310053 16.7662137153 -5013.1850218645 -28.9862091408 95.4444989088 22.0220673443 20.3074634962 3.4286173278 -0.0078273439 0.0340764532 0.6845095066 
-     600 -5025.3985715734 -5041.2158947893 15.8173232159 -5012.4875319345 -28.7283628548 90.0427797270 22.0209262700 20.3103065099 3.4653840648 -0.0141442608 0.0229602847 0.0009001093 
-     700 -5025.3997561572 -5041.6276721306 16.2279159734 -5012.7093581188 -28.9183140118 92.3801482386 22.0191651506 20.3120184840 3.4291788224 -0.0208485646 0.0104216414 -0.6668311564 
-     800 -5025.3967603736 -5042.3401685987 16.9434082251 -5013.3044877099 -29.0356808888 96.4532085367 22.0167259920 20.3122737443 3.3535033285 -0.0279747378 -0.0060833621 -0.7003492925 
-     900 -5025.3984542801 -5042.2820667481 16.8836124680 -5013.4066841442 -28.8753826039 96.1128111061 22.0136711877 20.3107854823 3.3206430872 -0.0331979094 -0.0237440547 0.1335648638 
-    1000 -5025.3988185618 -5041.9160822433 16.5172636815 -5012.8147737983 -29.1013084450 94.0273088606 22.0102627032 20.3075977018 3.3736867454 -0.0340065996 -0.0390649991 0.7872380119 
-Loop time of 42.5422 on 4 procs for 1000 steps with 1360 atoms
+       0 -5025.3867727863 -5040.0767396377 14.6899668514 -5011.2636302897 -28.8131093480 83.6251135127 22.0155657205 20.2812150219 3.4623630945 0.0282287195 0.0535565745 0.2193320108 
+     100 -5025.3962438431 -5041.3829780735 15.9867342304 -5012.5109382383 -28.8720398352 91.0071804956 22.0181858078 20.2867731676 3.4456714402 0.0241525932 0.0573807336 -0.5235069015 
+     200 -5025.3942574000 -5041.9638225847 16.5695651847 -5012.7804304371 -29.1833921476 94.3250439874 22.0203529515 20.2926376511 3.3740502908 0.0186420748 0.0595018114 -0.7867265578 
+     300 -5025.3919468212 -5040.9705424499 15.5785956286 -5012.0510300232 -28.9195124266 88.6837826792 22.0218424095 20.2984380400 3.3199036613 0.0106250874 0.0544668352 -0.1513745907 
+     400 -5025.3965382086 -5041.6929969192 16.2964587107 -5012.6418095739 -29.0511873454 92.7703393291 22.0224243957 20.3034636122 3.3515794172 0.0006844935 0.0458598502 0.6967704497 
+     500 -5025.4050178038 -5042.1712315208 16.7662137170 -5013.1850223792 -28.9862091417 95.4444989189 22.0220673443 20.3074634962 3.4286173278 -0.0078273439 0.0340764532 0.6845095066 
+     600 -5025.3985720873 -5041.2158953052 15.8173232179 -5012.4875324499 -28.7283628553 90.0427797386 22.0209262700 20.3103065099 3.4653840648 -0.0141442608 0.0229602847 0.0009001092 
+     700 -5025.3997566711 -5041.6276726420 16.2279159709 -5012.7093586298 -28.9183140122 92.3801482242 22.0191651506 20.3120184840 3.4291788224 -0.0208485646 0.0104216414 -0.6668311565 
+     800 -5025.3967608874 -5042.3401691104 16.9434082230 -5013.3044882226 -29.0356808878 96.4532085250 22.0167259920 20.3122737443 3.3535033285 -0.0279747378 -0.0060833621 -0.7003492926 
+     900 -5025.3984547938 -5042.2820672614 16.8836124676 -5013.4066846579 -28.8753826035 96.1128111040 22.0136711877 20.3107854823 3.3206430872 -0.0331979094 -0.0237440547 0.1335648640 
+    1000 -5025.3988190757 -5041.9160827657 16.5172636900 -5012.8147743212 -29.1013084444 94.0273089090 22.0102627032 20.3075977018 3.3736867454 -0.0340065996 -0.0390649991 0.7872380119 
+Loop time of 33.7338 on 4 procs for 1000 steps with 1360 atoms

-Performance: 2.031 ns/day, 11.817 hours/ns, 23.506 timesteps/s
-98.9% CPU use with 4 MPI tasks x 1 OpenMP threads
+Performance: 2.561 ns/day, 9.370 hours/ns, 29.644 timesteps/s
+94.1% CPU use with 4 MPI tasks x 1 OpenMP threads

 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 39.928     | 40.992     | 42.377     |  15.8 | 96.36
-Bond    | 0.0003643  | 0.00043392 | 0.00048113 |   0.0 |  0.00
+Pair    | 30.833     | 31.356     | 32.18      |   9.1 | 92.95
+Bond    | 0.00026059 | 0.00029182 | 0.00031185 |   0.0 |  0.00
 Neigh   | 0          | 0          | 0          |   0.0 |  0.00
-Comm    | 0.12253    | 1.5076     | 2.5698     |  82.1 |  3.54
-Output  | 0.0012577  | 0.0013637  | 0.0016453  |   0.4 |  0.00
-Modify  | 0.010833   | 0.012247   | 0.013317   |   0.9 |  0.03
-Other   |            | 0.02864    |            |       |  0.07
+Comm    | 1.443      | 2.2722     | 2.8091     |  34.3 |  6.74
+Output  | 0.00068855 | 0.00095087 | 0.0017185  |   0.0 |  0.00
+Modify  | 0.010187   | 0.011709   | 0.015284   |   1.9 |  0.03
+Other   |            | 0.09241    |            |       |  0.27

 Nlocal:    340 ave 344 max 334 min
 Histogram: 1 0 0 0 0 0 1 0 1 1
@ -133,4 +136,4 @@ Ave neighs/atom = 195.004
 Ave special neighs/atom = 0
 Neighbor list builds = 0
 Dangerous builds = 0
-Total wall time: 0:00:42
+Total wall time: 0:00:33
--- a/lib/gpu/lal_base_atomic.cpp
+++ b/lib/gpu/lal_base_atomic.cpp
@ -64,9 +64,12 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
  } else
    _nbor_data=&(nbor->dev_nbor);

-  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,false,
-                           _threads_per_atom);
+  int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
+  if (success!=0)
+    return success;
+
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_base_charge.cpp
+++ b/lib/gpu/lal_base_charge.cpp
@ -65,9 +65,12 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
  } else
    _nbor_data=&(nbor->dev_nbor);

-  int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,false,
-                           _threads_per_atom);
+  int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
+  if (success!=0)
+    return success;
+
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_base_dipole.cpp
+++ b/lib/gpu/lal_base_dipole.cpp
@ -66,9 +66,12 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
  } else
    _nbor_data=&(nbor->dev_nbor);

-  int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,false,
-                           _threads_per_atom);
+  int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
+  if (success!=0)
+    return success;
+
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_base_dpd.cpp
+++ b/lib/gpu/lal_base_dpd.cpp
@ -65,9 +65,13 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
  } else
    _nbor_data=&(nbor->dev_nbor);

-  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,false,
-                           _threads_per_atom,true);
+  int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
+  if (success!=0)
+    return success;
+
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
+
  if (success!=0)
    return success;

--- a/lib/gpu/lal_base_ellipsoid.cpp
+++ b/lib/gpu/lal_base_ellipsoid.cpp
@ -71,12 +71,15 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,

  _threads_per_atom=device->threads_per_atom();

-  int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,true,
-                           1);
+  int success=device->init(*ans,false,true,nlocal,nall,maxspecial);
  if (success!=0)
    return success;

+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,true,1);
+  if (success!=0)
+    return success;
+  
  ucl_device=device->gpu;
  atom=&device->atom;

--- a/lib/gpu/lal_base_three.cpp
+++ b/lib/gpu/lal_base_three.cpp
@ -78,9 +78,12 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
  if (_threads_per_atom*_threads_per_atom>device->warp_size())
    return -10;

-  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,false,
-                           _threads_per_atom);
+  int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
+  if (success!=0)
+    return success;
+
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -246,11 +246,8 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
 template <class numtyp, class acctyp>
 int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
                  const bool rot, const int nlocal,
-                  const int host_nlocal, const int nall,
-                  Neighbor *nbor, const int maxspecial,
-                  const int gpu_host, const int max_nbors,
-                  const double cell_size, const bool pre_cut,
-                  const int threads_per_atom, const bool vel) {
+                  const int nall, const int maxspecial,
+                  const bool vel) {
  if (!_device_init)
    return -1;
  if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
@ -301,16 +298,6 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
  if (!ans.init(ef_nlocal,charge,rot,*gpu))
    return -3;

-  if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
-                  *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
-                  _block_cell_id, _block_nbor_build, threads_per_atom,
-                  _warp_size, _time_device, compile_string()))
-    return -3;
-  if (_cell_size<0.0)
-    nbor->cell_size(cell_size,cell_size);
-  else
-    nbor->cell_size(_cell_size,cell_size);
-
  _init_count++;
  return 0;
 }
@ -338,6 +325,39 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
  return 0;
 }

+template <class numtyp, class acctyp>
+int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
+                  const int host_nlocal, const int nall,
+                  const int maxspecial, const int gpu_host,
+                  const int max_nbors, const double cell_size,
+                  const bool pre_cut, const int threads_per_atom) {
+  int ef_nlocal=nlocal;
+  if (_particle_split<1.0 && _particle_split>0.0)
+    ef_nlocal=static_cast<int>(_particle_split*nlocal);
+ 
+  int gpu_nbor=0;
+  if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH)
+    gpu_nbor=1;
+  else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
+    gpu_nbor=2;
+  #ifndef USE_CUDPP
+  if (gpu_nbor==1)
+    gpu_nbor=2;
+  #endif
+
+  if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
+                  *gpu,gpu_nbor,gpu_host,pre_cut,_block_cell_2d,
+                  _block_cell_id, _block_nbor_build, threads_per_atom,
+                  _warp_size, _time_device, compile_string()))
+    return -3;
+  if (_cell_size<0.0)
+    nbor->cell_size(cell_size,cell_size);
+  else
+    nbor->cell_size(_cell_size,cell_size);
+
+  return 0;
+}
+
 template <class numtyp, class acctyp>
 void DeviceT::set_single_precompute
                     (PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm) {
@ -614,7 +634,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
    if (screen && times[6]>0.0) {
      fprintf(screen,"\n\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");
-      fprintf(screen,"    Device Time Info (average): ");
+      fprintf(screen,"    Device Time Info (average) for kspace: ");
      fprintf(screen,"\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");

--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@ -53,11 +53,43 @@ class Device {
                  const int t_per_atom, const double cell_size,
                  char *vendor_string, const int block_pair);

-  /// Initialize the device for Atom and Neighbor storage
-  /** \param rot True if quaternions need to be stored
+  /// Initialize the device for Atom storage
+  /** \param charge True if charges need to be stored 
+    * \param rot True if quaternions need to be stored
+    * \param nlocal Total number of local particles to allocate memory for
+    * \param nall Total number of local+ghost particles
+    * \param maxspecial Maximum mumber of special bonded atoms per atom
+    * \param vel True if velocities need to be stored
+    *
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
+           const int nlocal, const int nall, const int maxspecial,
+           const bool vel=false);
+
+  /// Initialize the device for Atom storage only
+  /** \param nlocal Total number of local particles to allocate memory for
+    * \param nall Total number of local+ghost particles
+    *
+    * Returns:
+    * -  0 if successfull
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
+
+  /// Initialize the neighbor list storage
+  /** \param charge True if charges need to be stored
+    * \param rot True if quaternions need to be stored
    * \param nlocal Total number of local particles to allocate memory for
    * \param host_nlocal Initial number of host particles to allocate memory for
    * \param nall Total number of local+ghost particles
+    * \param maxspecial Maximum mumber of special bonded atoms per atom
    * \param gpu_host 0 if host will not perform force calculations,
    *                 1 if gpu_nbor is true, and host needs a half nbor list,
    *                 2 if gpu_nbor is true, and host needs a full nbor list
@ -73,23 +105,11 @@ class Device {
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
-  int init(Answer<numtyp,acctyp> &a, const bool charge, const bool rot,
-           const int nlocal, const int host_nlocal, const int nall,
-           Neighbor *nbor, const int maxspecial, const int gpu_host,
-           const int max_nbors, const double cell_size, const bool pre_cut,
-           const int threads_per_atom, const bool vel=false);
-
-  /// Initialize the device for Atom storage only
-  /** \param nlocal Total number of local particles to allocate memory for
-    * \param nall Total number of local+ghost particles
-    *
-    * Returns:
-    * -  0 if successfull
-    * - -1 if fix gpu not found
-    * - -3 if there is an out of memory error
-    * - -4 if the GPU library was not compiled for GPU
-    * - -5 Double precision is not supported on card **/
-  int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
+  int init_nbor(Neighbor *nbor, const int nlocal,
+                const int host_nlocal, const int nall,
+                const int maxspecial, const int gpu_host,
+                const int max_nbors, const double cell_size,
+                const bool pre_cut, const int threads_per_atom);

  /// Output a message for pair_style acceleration with device stats
  void init_message(FILE *screen, const char *name,
@ -173,7 +193,7 @@ class Device {
  /// Return host memory usage in bytes
  double host_memory_usage() const;

-  /// Return the number of procs sharing a device (size of device commincator)
+  /// Return the number of procs sharing a device (size of device communicator)
  inline int procs_per_gpu() const { return _procs_per_gpu; }
  /// Return the number of threads per proc
  inline int num_threads() const { return _nthreads; }
@ -260,12 +280,12 @@ class Device {
  /// Atom Data
  Atom<numtyp,acctyp> atom;

-  // --------------------------- NBOR DATA ----------------------------
+  // --------------------------- NBOR SHARED KERNELS ----------------

-  /// Neighbor Data
+  /// Shared kernels for neighbor lists
  NeighborShared _neighbor_shared;

-  // ------------------------ LONG RANGE DATA -------------------------
+  // ------------------------ LONG RANGE DATA -----------------------

  // Long Range Data
  int _long_range_precompute;
--- a/potentials/Ta06A.snapparam
+++ b/potentials/Ta06A.snapparam
@ -10,6 +10,5 @@ twojmax 6

 rfac0 0.99363
 rmin0 0
-diagonalstyle 3
 bzeroflag 0
 quadraticflag 0
--- a/potentials/W_2940_2017_2.snapparam
+++ b/potentials/W_2940_2017_2.snapparam
@ -8,6 +8,5 @@ twojmax 8

 rfac0 0.99363
 rmin0 0
-diagonalstyle 3
 bzeroflag 0
 quadraticflag 0
--- a/src/.gitignore
+++ b/src/.gitignore
@ -167,6 +167,10 @@
 /pair_spin.h
 /pair_spin_dmi.cpp
 /pair_spin_dmi.h
+/pair_spin_dipole_cut.cpp
+/pair_spin_dipole_cut.h
+/pair_spin_dipole_long.cpp
+/pair_spin_dipole_long.h
 /pair_spin_exchange.cpp
 /pair_spin_exchange.h
 /pair_spin_magelec.cpp
@ -428,6 +432,10 @@
 /ewald.h
 /ewald_cg.cpp
 /ewald_cg.h
+/ewald_dipole.cpp
+/ewald_dipole.h
+/ewald_dipole_spin.cpp
+/ewald_dipole_spin.h
 /ewald_disp.cpp
 /ewald_disp.h
 /ewald_n.cpp
@ -1027,6 +1035,10 @@
 /pppm.h
 /pppm_cg.cpp
 /pppm_cg.h
+/pppm_dipole.cpp
+/pppm_dipole.h
+/pppm_dipole_spin.cpp
+/pppm_dipole_spin.h
 /pppm_disp.cpp
 /pppm_disp.h
 /pppm_disp_tip4p.cpp
--- a/src/CLASS2/pair_lj_class2.cpp
+++ b/src/CLASS2/pair_lj_class2.cpp
@ -2,12 +2,10 @@
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
-
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
-
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

@ -19,7 +17,12 @@
 #include "atom.h"
 #include "comm.h"
 #include "force.h"
+#include "neighbor.h"
 #include "neigh_list.h"
+#include "neigh_request.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
 #include "math_const.h"
 #include "memory.h"
 #include "error.h"
@ -31,6 +34,7 @@ using namespace MathConst;

 PairLJClass2::PairLJClass2(LAMMPS *lmp) : Pair(lmp)
 {
+  respa_enable = 1;
  writedata = 1;
 }

@ -133,6 +137,270 @@ void PairLJClass2::compute(int eflag, int vflag)
  if (vflag_fdotr) virial_fdotr_compute();
 }

+/* ----------------------------------------------------------------------
+*/
+
+void PairLJClass2::compute_inner()
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fpair;
+  double rsq,rinv,r2inv,r3inv,r6inv,forcelj,factor_lj,rsw;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_lj = force->special_lj;
+  int newton_pair = force->newton_pair;
+
+  inum = list->inum_inner;
+  ilist = list->ilist_inner;
+  numneigh = list->numneigh_inner;
+  firstneigh = list->firstneigh_inner;
+
+  double cut_out_on = cut_respa[0];
+  double cut_out_off = cut_respa[1];
+
+  double cut_out_diff = cut_out_off - cut_out_on;
+  double cut_out_on_sq = cut_out_on*cut_out_on;
+  double cut_out_off_sq = cut_out_off*cut_out_off;
+
+  // loop over neighbors of my atoms
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cut_out_off_sq) {
+        r2inv = 1.0/rsq;
+		rinv = sqrt(r2inv);
+		r3inv = r2inv*rinv;
+        r6inv = r3inv*r3inv;
+        jtype = type[j];
+        forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
+        fpair = factor_lj*forcelj*r2inv;
+        if (rsq > cut_out_on_sq) {
+          rsw = (sqrt(rsq) - cut_out_on)/cut_out_diff;
+          fpair *= 1.0 - rsw*rsw*(3.0 - 2.0*rsw);
+        }
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+        if (newton_pair || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2::compute_middle()
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fpair;
+  double rsq,rinv,r2inv,r3inv,r6inv,forcelj,factor_lj,rsw;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_lj = force->special_lj;
+  int newton_pair = force->newton_pair;
+
+  inum = list->inum_middle;
+  ilist = list->ilist_middle;
+  numneigh = list->numneigh_middle;
+  firstneigh = list->firstneigh_middle;
+
+  double cut_in_off = cut_respa[0];
+  double cut_in_on = cut_respa[1];
+  double cut_out_on = cut_respa[2];
+  double cut_out_off = cut_respa[3];
+
+  double cut_in_diff = cut_in_on - cut_in_off;
+  double cut_out_diff = cut_out_off - cut_out_on;
+  double cut_in_off_sq = cut_in_off*cut_in_off;
+  double cut_in_on_sq = cut_in_on*cut_in_on;
+  double cut_out_on_sq = cut_out_on*cut_out_on;
+  double cut_out_off_sq = cut_out_off*cut_out_off;
+
+  // loop over neighbors of my atoms
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq < cut_out_off_sq && rsq > cut_in_off_sq) {
+        r2inv = 1.0/rsq;
+		rinv = sqrt(r2inv);
+		r3inv = r2inv*rinv;
+        r6inv = r3inv*r3inv;
+        jtype = type[j];
+        forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
+        fpair = factor_lj*forcelj*r2inv;
+        if (rsq < cut_in_on_sq) {
+          rsw = (sqrt(rsq) - cut_in_off)/cut_in_diff;
+          fpair *= rsw*rsw*(3.0 - 2.0*rsw);
+        }
+        if (rsq > cut_out_on_sq) {
+          rsw = (sqrt(rsq) - cut_out_on)/cut_out_diff;
+          fpair *= 1.0 + rsw*rsw*(2.0*rsw - 3.0);
+        }
+
+        f[i][0] += delx*fpair;
+        f[i][1] += dely*fpair;
+        f[i][2] += delz*fpair;
+        if (newton_pair || j < nlocal) {
+          f[j][0] -= delx*fpair;
+          f[j][1] -= dely*fpair;
+          f[j][2] -= delz*fpair;
+        }
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2::compute_outer(int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,rinv,r2inv,r3inv,r6inv,forcelj,factor_lj,rsw;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+  ev_init(eflag,vflag);
+
+  double **x = atom->x;
+  double **f = atom->f;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  double *special_lj = force->special_lj;
+  int newton_pair = force->newton_pair;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  double cut_in_off = cut_respa[2];
+  double cut_in_on = cut_respa[3];
+
+  double cut_in_diff = cut_in_on - cut_in_off;
+  double cut_in_off_sq = cut_in_off*cut_in_off;
+  double cut_in_on_sq = cut_in_on*cut_in_on;
+
+  // loop over neighbors of my atoms
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    xtmp = x[i][0];
+    ytmp = x[i][1];
+    ztmp = x[i][2];
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j][0];
+      dely = ytmp - x[j][1];
+      delz = ztmp - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        if (rsq > cut_in_off_sq) {
+          r2inv = 1.0/rsq;
+		  rinv = sqrt(r2inv);
+		  r3inv = r2inv*rinv;
+          r6inv = r3inv*r3inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
+          fpair = factor_lj*forcelj*r2inv;
+          if (rsq < cut_in_on_sq) {
+            rsw = (sqrt(rsq) - cut_in_off)/cut_in_diff;
+            fpair *= rsw*rsw*(3.0 - 2.0*rsw);
+          }
+
+          f[i][0] += delx*fpair;
+          f[i][1] += dely*fpair;
+          f[i][2] += delz*fpair;
+          if (newton_pair || j < nlocal) {
+            f[j][0] -= delx*fpair;
+            f[j][1] -= dely*fpair;
+            f[j][2] -= delz*fpair;
+          }
+        }
+
+        if (eflag) {
+          r2inv = 1.0/rsq;
+		  rinv = sqrt(r2inv);
+		  r3inv = r2inv*rinv;
+          r6inv = r3inv*r3inv;
+          evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
+            offset[itype][jtype];
+          evdwl *= factor_lj;
+        }
+
+        if (vflag) {
+          if (rsq <= cut_in_off_sq) {
+            r2inv = 1.0/rsq;
+			rinv = sqrt(r2inv);
+			r3inv = r2inv*rinv;
+            r6inv = r3inv*r3inv;
+            forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
+            fpair = factor_lj*forcelj*r2inv;
+          } else if (rsq < cut_in_on_sq)
+            fpair = factor_lj*forcelj*r2inv;
+        }
+
+        if (evflag) ev_tally(i,j,nlocal,newton_pair,
+                             evdwl,0.0,fpair,delx,dely,delz);
+      }
+    }
+  }
+}
 /* ----------------------------------------------------------------------
   allocate all arrays
 ------------------------------------------------------------------------- */
@ -212,6 +480,38 @@ void PairLJClass2::coeff(int narg, char **arg)
  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
 }

+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairLJClass2::init_style()
+{
+  // request regular or rRESPA neighbor list
+
+  int irequest;
+  int respa = 0;
+
+  if (update->whichflag == 1 && strstr(update->integrate_style,"respa")) {
+    if (((Respa *) update->integrate)->level_inner >= 0) respa = 1;
+    if (((Respa *) update->integrate)->level_middle >= 0) respa = 2;
+  }
+
+  irequest = neighbor->request(this,instance_me);
+
+  if (respa >= 1) {
+    neighbor->requests[irequest]->respaouter = 1;
+    neighbor->requests[irequest]->respainner = 1;
+  }
+  if (respa == 2) neighbor->requests[irequest]->respamiddle = 1;
+
+  // set rRESPA cutoffs
+
+  if (strstr(update->integrate_style,"respa") &&
+      ((Respa *) update->integrate)->level_inner >= 0)
+    cut_respa = ((Respa *) update->integrate)->cutoff;
+  else cut_respa = NULL;
+}
+
 /* ----------------------------------------------------------------------
   init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
@ -246,6 +546,11 @@ double PairLJClass2::init_one(int i, int j)
  lj4[j][i] = lj4[i][j];
  offset[j][i] = offset[i][j];

+  // check interior rRESPA cutoff
+
+  if (cut_respa && cut[i][j] < cut_respa[3])
+    error->all(FLERR,"Pair cutoff < Respa interior cutoff");
+
  // compute I,J contribution to long-range tail correction
  // count total # of atoms of type I and J via Allreduce

--- a/src/CLASS2/pair_lj_class2.h
+++ b/src/CLASS2/pair_lj_class2.h
@ -2,12 +2,10 @@
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov
-
   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.
-
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

@ -31,6 +29,7 @@ class PairLJClass2 : public Pair {
  virtual void compute(int, int);
  virtual void settings(int, char **);
  void coeff(int, char **);
+  void init_style();
  virtual double init_one(int, int);
  void write_restart(FILE *);
  void read_restart(FILE *);
@ -41,11 +40,16 @@ class PairLJClass2 : public Pair {
  double single(int, int, int, int, double, double, double, double &);
  void *extract(const char *, int &);

+  void compute_inner();
+  void compute_middle();
+  void compute_outer(int, int);
+
 protected:
  double cut_global;
  double **cut;
  double **epsilon,**sigma;
  double **lj1,**lj2,**lj3,**lj4,**offset;
+  double *cut_respa;

  virtual void allocate();
 };
@ -56,15 +60,13 @@ class PairLJClass2 : public Pair {
 #endif

 /* ERROR/WARNING messages:
-
 E: Illegal ... command
-
 Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
-
 E: Incorrect args for pair coefficients
-
 Self-explanatory.  Check the input script or data file.
-
+E: Pair cutoff < Respa interior cutoff
+One or more pairwise cutoffs are too short to use with the specified
+rRESPA cutoffs.
 */
--- a/src/DIPOLE/pair_lj_cut_dipole_long.cpp
+++ b/src/DIPOLE/pair_lj_cut_dipole_long.cpp
@ -44,7 +44,7 @@ using namespace MathConst;
 PairLJCutDipoleLong::PairLJCutDipoleLong(LAMMPS *lmp) : Pair(lmp)
 {
  single_enable = 0;
-  ewaldflag = dipoleflag = 1;
+  ewaldflag = pppmflag = dipoleflag = 1;
  respa_enable = 0;
 }

--- a/src/GPU/README
+++ b/src/GPU/README
@ -1,9 +1,8 @@
 This package implements GPU optimizations of various LAMMPS styles.

-Section 5.3.1 on the manual gives details of what hardware and Cuda
+Section 3.7 of the manual gives details of what hardware and Cuda
 software is required on your system, and full details on how to build
-and use this package.  See the KOKKOS package, which also has
-GPU-enabled styles.
+and use this package.  The KOKKOS package also has GPU-enabled styles.

 This package uses an external library provided in lib/gpu which must
 be compiled before making LAMMPS.  See the lib/gpu/README file and the
--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@ -219,17 +219,6 @@ void FixGPU::init()
    error->all(FLERR,"GPU package does not (yet) work with "
               "atom_style template");

-  // hybrid cannot be used with force/neigh option
-
-  if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
-    if (force->pair_match("^hybrid",0) != NULL)
-      error->all(FLERR,"Cannot use pair hybrid with GPU neighbor list builds");
-
-  if (_particle_split < 0)
-    if (force->pair_match("^hybrid",0) != NULL)
-      error->all(FLERR,"GPU split param must be positive "
-                 "for hybrid pair styles");
-
  // neighbor list builds on the GPU with triclinic box is not yet supported

  if ((_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) &&
--- a/src/GPU/fix_gpu.h
+++ b/src/GPU/fix_gpu.h
@ -65,14 +65,6 @@ E: GPU package does not (yet) work with atom_style template

 Self-explanatory.

-E: Cannot use pair hybrid with GPU neighbor list builds
-
-Neighbor list builds must be done on the CPU for this pair style.
-
-E: GPU split param must be positive for hybrid pair styles
-
-See the package gpu command.
-
 E: Cannot use package gpu neigh yes with triclinic box

 This is a current restriction in LAMMPS.
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@ -85,9 +85,6 @@ void PairSNAPKokkos<DeviceType>::init_style()
  if (force->newton_pair == 0)
    error->all(FLERR,"Pair style SNAP requires newton pair on");

-  if (diagonalstyle != 3)
-    error->all(FLERR,"Must use diagonal style = 3 with pair snap/kk");
-
  // irequest = neigh request made by parent class

  neighflag = lmp->kokkos->neighflag;
@ -343,23 +340,12 @@ void PairSNAPKokkos<DeviceType>::coeff(int narg, char **arg)
  Kokkos::deep_copy(d_coeffelem,h_coeffelem);
  Kokkos::deep_copy(d_map,h_map);

-  // deallocate non-kokkos sna
-
-  if (sna) {
-    for (int tid = 0; tid<nthreads; tid++)
-      delete sna[tid];
-    delete [] sna;
-    sna = NULL;
-  }
-
  // allocate memory for per OpenMP thread data which
  // is wrapped into the sna class

  snaKK = SNAKokkos<DeviceType>(rfac0,twojmax,
-                  diagonalstyle,use_shared_arrays,
                  rmin0,switchflag,bzeroflag);
-    //if (!use_shared_arrays)
-  snaKK.grow_rij(nmax);
+  snaKK.grow_rij(0);
  snaKK.init();
 }

@ -667,8 +653,6 @@ double PairSNAPKokkos<DeviceType>::memory_usage()
  int n = atom->ntypes+1;
  bytes += n*n*sizeof(int);
  bytes += n*n*sizeof(double);
-  bytes += 3*nmax*sizeof(double);
-  bytes += nmax*sizeof(int);
  bytes += (2*ncoeffall)*sizeof(double);
  bytes += (ncoeff*3)*sizeof(double);
  bytes += snaKK.memory_usage();
--- a/src/KOKKOS/sna_kokkos.h
+++ b/src/KOKKOS/sna_kokkos.h
@ -48,7 +48,7 @@ inline
  SNAKokkos(const SNAKokkos<DeviceType>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team);

 inline
-  SNAKokkos(double, int, int, int, double, int, int);
+  SNAKokkos(double, int, double, int, int);

  KOKKOS_INLINE_FUNCTION
  ~SNAKokkos();
@ -178,12 +178,6 @@ inline
                       double, double, double, // compute_duidrj
                       double, double, double, double, double);

-  // if number of atoms are small use per atom arrays
-  // for twojmax arrays, rij, inside, bvec
-  // this will increase the memory footprint considerably,
-  // but allows parallel filling and reuse of these arrays
-  int use_shared_arrays;
-
  // Sets the style for the switching function
  // 0 = none
  // 1 = cosine
--- a/src/KOKKOS/sna_kokkos_impl.h
+++ b/src/KOKKOS/sna_kokkos_impl.h
@ -27,19 +27,17 @@ static const double MY_PI  = 3.14159265358979323846; // pi
 template<class DeviceType>
 inline
 SNAKokkos<DeviceType>::SNAKokkos(double rfac0_in,
-         int twojmax_in, int diagonalstyle_in, int use_shared_arrays_in,
+         int twojmax_in,
         double rmin0_in, int switch_flag_in, int bzero_flag_in)
 {
  wself = 1.0;

-  use_shared_arrays = use_shared_arrays_in;
  rfac0 = rfac0_in;
  rmin0 = rmin0_in;
  switch_flag = switch_flag_in;
  bzero_flag = bzero_flag_in;

  twojmax = twojmax_in;
-  diagonalstyle = diagonalstyle_in;

  ncoeff = compute_ncoeff();

@ -70,14 +68,12 @@ KOKKOS_INLINE_FUNCTION
 SNAKokkos<DeviceType>::SNAKokkos(const SNAKokkos<DeviceType>& sna, const typename Kokkos::TeamPolicy<DeviceType>::member_type& team) {
  wself = sna.wself;

-  use_shared_arrays = sna.use_shared_arrays;
  rfac0 = sna.rfac0;
  rmin0 = sna.rmin0;
  switch_flag = sna.switch_flag;
  bzero_flag = sna.bzero_flag;

  twojmax = sna.twojmax;
-  diagonalstyle = sna.diagonalstyle;

  ncoeff = sna.ncoeff;
  nmax = sna.nmax;
@ -104,48 +100,45 @@ template<class DeviceType>
 inline
 void SNAKokkos<DeviceType>::build_indexlist()
 {
-  if(diagonalstyle == 3) {
-    int idxj_count = 0;
-    int idxj_full_count = 0;
+  int idxj_count = 0;
+  int idxj_full_count = 0;

-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-          if (j >= j1) idxj_count++;
-          idxj_full_count++;
-        }
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+	if (j >= j1) idxj_count++;
+	idxj_full_count++;
+      }

-    // indexList can be changed here
+  // indexList can be changed here

-    idxj = Kokkos::View<SNAKK_LOOPINDICES*, DeviceType>("SNAKokkos::idxj",idxj_count);
-    idxj_full = Kokkos::View<SNAKK_LOOPINDICES*, DeviceType>("SNAKokkos::idxj_full",idxj_full_count);
-    auto h_idxj = Kokkos::create_mirror_view(idxj);
-    auto h_idxj_full = Kokkos::create_mirror_view(idxj_full);
+  idxj = Kokkos::View<SNAKK_LOOPINDICES*, DeviceType>("SNAKokkos::idxj",idxj_count);
+  idxj_full = Kokkos::View<SNAKK_LOOPINDICES*, DeviceType>("SNAKokkos::idxj_full",idxj_full_count);
+  auto h_idxj = Kokkos::create_mirror_view(idxj);
+  auto h_idxj_full = Kokkos::create_mirror_view(idxj_full);

-    idxj_max = idxj_count;
-    idxj_full_max = idxj_full_count;
+  idxj_max = idxj_count;
+  idxj_full_max = idxj_full_count;

-    idxj_count = 0;
-    idxj_full_count = 0;
+  idxj_count = 0;
+  idxj_full_count = 0;

-    for(int j1 = 0; j1 <= twojmax; j1++)
-      for(int j2 = 0; j2 <= j1; j2++)
-        for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
-          if (j >= j1) {
-            h_idxj[idxj_count].j1 = j1;
-            h_idxj[idxj_count].j2 = j2;
-            h_idxj[idxj_count].j = j;
-            idxj_count++;
-          }
-          h_idxj_full[idxj_full_count].j1 = j1;
-          h_idxj_full[idxj_full_count].j2 = j2;
-          h_idxj_full[idxj_full_count].j = j;
-          idxj_full_count++;
-        }
-    Kokkos::deep_copy(idxj,h_idxj);
-    Kokkos::deep_copy(idxj_full,h_idxj_full);
-
-  }
+  for(int j1 = 0; j1 <= twojmax; j1++)
+    for(int j2 = 0; j2 <= j1; j2++)
+      for(int j = abs(j1 - j2); j <= MIN(twojmax, j1 + j2); j += 2) {
+	if (j >= j1) {
+	  h_idxj[idxj_count].j1 = j1;
+	  h_idxj[idxj_count].j2 = j2;
+	  h_idxj[idxj_count].j = j;
+	  idxj_count++;
+	}
+	h_idxj_full[idxj_full_count].j1 = j1;
+	h_idxj_full[idxj_full_count].j2 = j2;
+	h_idxj_full[idxj_full_count].j = j;
+	idxj_full_count++;
+      }
+  Kokkos::deep_copy(idxj,h_idxj);
+  Kokkos::deep_copy(idxj_full,h_idxj_full);

 }
 /* ---------------------------------------------------------------------- */
@ -1223,26 +1216,10 @@ int SNAKokkos<DeviceType>::compute_ncoeff()
  ncount = 0;

  for (int j1 = 0; j1 <= twojmax; j1++)
-    if(diagonalstyle == 0) {
-      for (int j2 = 0; j2 <= j1; j2++)
-        for (int j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          ncount++;
-    } else if(diagonalstyle == 1) {
-      int j2 = j1;
-
+    for (int j2 = 0; j2 <= j1; j2++)
      for (int j = abs(j1 - j2);
-          j <= MIN(twojmax, j1 + j2); j += 2)
-        ncount++;
-    } else if(diagonalstyle == 2) {
-      ncount++;
-    } else if(diagonalstyle == 3) {
-      for (int j2 = 0; j2 <= j1; j2++)
-        for (int j = abs(j1 - j2);
-            j <= MIN(twojmax, j1 + j2); j += 2)
-          if (j >= j1) ncount++;
-    }
-
+	   j <= MIN(twojmax, j1 + j2); j += 2)
+	if (j >= j1) ncount++;
  return ncount;
 }

--- a/src/KSPACE/ewald_dipole.cpp
+++ b/src/KSPACE/ewald_dipole.cpp
@ -0,0 +1,920 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Julien Tranchida (SNL), Stan Moore (SNL)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include "ewald_dipole.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "pair.h"
+#include "domain.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "error.h"
+#include "update.h"
+
+#include "math_const.h"
+#include "math_special.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecial;
+
+#define SMALL 0.00001
+
+/* ---------------------------------------------------------------------- */
+
+EwaldDipole::EwaldDipole(LAMMPS *lmp) : Ewald(lmp),
+  tk(NULL), vc(NULL)
+{
+  ewaldflag = dipoleflag = 1;
+  group_group_enable = 0;
+  tk = NULL;
+  vc = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   free all memory
+------------------------------------------------------------------------- */
+
+EwaldDipole::~EwaldDipole()
+{
+  memory->destroy(tk);
+  memory->destroy(vc);
+}
+
+/* ----------------------------------------------------------------------
+   called once before run
+------------------------------------------------------------------------- */
+
+void EwaldDipole::init()
+{
+  if (comm->me == 0) {
+    if (screen) fprintf(screen,"EwaldDipole initialization ...\n");
+    if (logfile) fprintf(logfile,"EwaldDipole initialization ...\n");
+  }
+
+  // error check
+  
+  dipoleflag = atom->mu?1:0;
+  qsum_qsq(0); // q[i] might not be declared ?
+
+  if (dipoleflag && q2)
+    error->all(FLERR,"Cannot (yet) use charges with Kspace style EwaldDipole");
+
+  triclinic_check();
+  
+  // no triclinic ewald dipole (yet)
+  
+  triclinic = domain->triclinic;
+  if (triclinic)
+    error->all(FLERR,"Cannot (yet) use EwaldDipole with triclinic box");
+  
+  if (domain->dimension == 2)
+    error->all(FLERR,"Cannot use EwaldDipole with 2d simulation");
+
+  if (!atom->mu) error->all(FLERR,"Kspace style requires atom attribute mu");
+
+  if (dipoleflag && strcmp(update->unit_style,"electron") == 0)
+    error->all(FLERR,"Cannot (yet) use 'electron' units with dipoles");
+  
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all(FLERR,"Cannot use nonperiodic boundaries with EwaldDipole");
+  if (slabflag) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all(FLERR,"Incorrect boundaries with slab EwaldDipole");
+  }
+
+  // extract short-range Coulombic cutoff from pair style
+
+  triclinic = domain->triclinic;
+  if (triclinic)
+    error->all(FLERR,"Cannot yet use triclinic cells with EwaldDipole");
+
+  pair_check();
+
+  int itmp;
+  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
+  if (p_cutoff == NULL)
+    error->all(FLERR,"KSpace style is incompatible with Pair style");
+  double cutoff = *p_cutoff;
+
+  // kspace TIP4P not yet supported
+  // qdist = offset only for TIP4P fictitious charge
+
+  //qdist = 0.0;
+  if (tip4pflag)
+    error->all(FLERR,"Cannot yet use TIP4P with EwaldDipole");
+
+  // compute musum & musqsum and warn if no dipole
+
+  scale = 1.0;
+  qqrd2e = force->qqrd2e;
+  musum_musq();
+  natoms_original = atom->natoms;
+
+  // set accuracy (force units) from accuracy_relative or accuracy_absolute
+
+  if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
+  else accuracy = accuracy_relative * two_charge_force;
+
+  // setup K-space resolution
+
+  bigint natoms = atom->natoms;
+
+  // use xprd,yprd,zprd even if triclinic so grid size is the same
+  // adjust z dimension for 2d slab EwaldDipole
+  // 3d EwaldDipole just uses zprd since slab_volfactor = 1.0
+
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  double zprd_slab = zprd*slab_volfactor;
+
+  // make initial g_ewald estimate
+  // based on desired accuracy and real space cutoff
+  // fluid-occupied volume used to estimate real-space error
+  // zprd used rather than zprd_slab
+
+  if (!gewaldflag) {
+    if (accuracy <= 0.0)
+      error->all(FLERR,"KSpace accuracy must be > 0");
+    
+    // initial guess with old method
+    
+    g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*mu2);
+    if (g_ewald >= 1.0) g_ewald = (1.35 - 0.15*log(accuracy))/cutoff;
+    else g_ewald = sqrt(-log(g_ewald)) / cutoff;
+    
+    // try Newton solver
+
+    double g_ewald_new =
+      NewtonSolve(g_ewald,cutoff,natoms,xprd*yprd*zprd,mu2);
+    if (g_ewald_new > 0.0) g_ewald = g_ewald_new;
+    else error->warning(FLERR,"Ewald/disp Newton solver failed, "
+                        "using old method to estimate g_ewald");  
+  }
+
+  // setup EwaldDipole coefficients so can print stats
+
+  setup();
+
+  // final RMS accuracy
+
+  double lprx = rms(kxmax_orig,xprd,natoms,q2);
+  double lpry = rms(kymax_orig,yprd,natoms,q2);
+  double lprz = rms(kzmax_orig,zprd_slab,natoms,q2);
+  double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
+  double q2_over_sqrt = q2 / sqrt(natoms*cutoff*xprd*yprd*zprd_slab);
+  double spr = 2.0 *q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff);
+  double tpr = estimate_table_accuracy(q2_over_sqrt,spr);
+  double estimated_accuracy = sqrt(lpr*lpr + spr*spr + tpr*tpr);
+
+  // stats
+
+  if (comm->me == 0) {
+    if (screen) {
+      fprintf(screen,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(screen,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(screen,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(screen,"  KSpace vectors: actual max1d max3d = %d %d %d\n",
+              kcount,kmax,kmax3d);
+      fprintf(screen,"                  kxmax kymax kzmax  = %d %d %d\n",
+              kxmax,kymax,kzmax);
+    }
+    if (logfile) {
+      fprintf(logfile,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(logfile,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(logfile,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(logfile,"  KSpace vectors: actual max1d max3d = %d %d %d\n",
+              kcount,kmax,kmax3d);
+      fprintf(logfile,"                  kxmax kymax kzmax  = %d %d %d\n",
+              kxmax,kymax,kzmax);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   adjust EwaldDipole coeffs, called initially and whenever volume has changed
+------------------------------------------------------------------------- */
+
+void EwaldDipole::setup()
+{
+  // volume-dependent factors
+
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+
+  // adjustment of z dimension for 2d slab EwaldDipole
+  // 3d EwaldDipole just uses zprd since slab_volfactor = 1.0
+
+  double zprd_slab = zprd*slab_volfactor;
+  volume = xprd * yprd * zprd_slab;
+
+  unitk[0] = 2.0*MY_PI/xprd;
+  unitk[1] = 2.0*MY_PI/yprd;
+  unitk[2] = 2.0*MY_PI/zprd_slab;
+
+  int kmax_old = kmax;
+
+  if (kewaldflag == 0) {
+
+    // determine kmax
+    // function of current box size, accuracy, G_ewald (short-range cutoff)
+
+    bigint natoms = atom->natoms;
+    double err;
+    kxmax = 1;
+    kymax = 1;
+    kzmax = 1;	
+
+    // set kmax in 3 directions to respect accuracy
+
+    err = rms_dipole(kxmax,xprd,natoms);
+    while (err > accuracy) {
+      kxmax++;
+      err = rms_dipole(kxmax,xprd,natoms);
+    }
+
+    err = rms_dipole(kymax,yprd,natoms);
+    while (err > accuracy) {
+      kymax++;
+      err = rms_dipole(kymax,yprd,natoms);
+    }
+
+    err = rms_dipole(kzmax,zprd,natoms);
+    while (err > accuracy) {
+      kzmax++;
+      err = rms_dipole(kzmax,zprd,natoms);
+    }
+
+    kmax = MAX(kxmax,kymax);
+    kmax = MAX(kmax,kzmax);
+    kmax3d = 4*kmax*kmax*kmax + 6*kmax*kmax + 3*kmax;
+
+    double gsqxmx = unitk[0]*unitk[0]*kxmax*kxmax;
+    double gsqymx = unitk[1]*unitk[1]*kymax*kymax;
+    double gsqzmx = unitk[2]*unitk[2]*kzmax*kzmax;
+    gsqmx = MAX(gsqxmx,gsqymx);
+    gsqmx = MAX(gsqmx,gsqzmx);
+
+    kxmax_orig = kxmax;
+    kymax_orig = kymax;
+    kzmax_orig = kzmax;
+
+  } else {
+
+    kxmax = kx_ewald;
+    kymax = ky_ewald;
+    kzmax = kz_ewald;
+
+    kxmax_orig = kxmax;
+    kymax_orig = kymax;
+    kzmax_orig = kzmax;
+
+    kmax = MAX(kxmax,kymax);
+    kmax = MAX(kmax,kzmax);
+    kmax3d = 4*kmax*kmax*kmax + 6*kmax*kmax + 3*kmax;
+
+    double gsqxmx = unitk[0]*unitk[0]*kxmax*kxmax;
+    double gsqymx = unitk[1]*unitk[1]*kymax*kymax;
+    double gsqzmx = unitk[2]*unitk[2]*kzmax*kzmax;
+    gsqmx = MAX(gsqxmx,gsqymx);
+    gsqmx = MAX(gsqmx,gsqzmx);
+  }
+
+  gsqmx *= 1.00001;
+
+  // if size has grown, reallocate k-dependent and nlocal-dependent arrays
+
+  if (kmax > kmax_old) {
+    deallocate();
+    allocate();
+    group_allocate_flag = 0;
+
+    memory->destroy(ek);
+    memory->destroy(tk);
+    memory->destroy(vc);
+    memory->destroy3d_offset(cs,-kmax_created);
+    memory->destroy3d_offset(sn,-kmax_created);
+    nmax = atom->nmax;
+    memory->create(ek,nmax,3,"ewald_dipole:ek");
+    memory->create(tk,nmax,3,"ewald_dipole:tk");
+    memory->create(vc,kmax3d,6,"ewald_dipole:tk");
+    memory->create3d_offset(cs,-kmax,kmax,3,nmax,"ewald_dipole:cs");
+    memory->create3d_offset(sn,-kmax,kmax,3,nmax,"ewald_dipole:sn");
+    kmax_created = kmax;
+  }
+
+  // pre-compute EwaldDipole coefficients
+
+  coeffs();
+}
+
+/* ----------------------------------------------------------------------
+   compute dipole RMS accuracy for a dimension
+------------------------------------------------------------------------- */
+
+double EwaldDipole::rms_dipole(int km, double prd, bigint natoms)
+{
+  if (natoms == 0) natoms = 1;   // avoid division by zero
+
+  // error from eq.(46), Wang et al., JCP 115, 6351 (2001)
+  
+  double value = 8*MY_PI*mu2*g_ewald/volume *
+    sqrt(2*MY_PI*km*km*km/(15.0*natoms)) *
+    exp(-MY_PI*MY_PI*km*km/(g_ewald*g_ewald*prd*prd));
+
+  return value;
+}
+
+/* ----------------------------------------------------------------------
+   compute the EwaldDipole long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+void EwaldDipole::compute(int eflag, int vflag)
+{
+  int i,j,k;
+  const double g3 = g_ewald*g_ewald*g_ewald;
+
+  // set energy/virial flags
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  // if atom count has changed, update qsum and qsqsum
+
+  if (atom->natoms != natoms_original) {
+    musum_musq();
+    natoms_original = atom->natoms;
+  }
+
+  // return if there are no charges
+
+  if (musqsum == 0.0) return;
+
+  // extend size of per-atom arrays if necessary
+
+  if (atom->nmax > nmax) {
+    memory->destroy(ek);
+    memory->destroy(tk);
+    memory->destroy(vc);
+    memory->destroy3d_offset(cs,-kmax_created);
+    memory->destroy3d_offset(sn,-kmax_created);
+    nmax = atom->nmax;
+    memory->create(ek,nmax,3,"ewald_dipole:ek");
+    memory->create(tk,nmax,3,"ewald_dipole:tk");
+    memory->create(vc,kmax3d,6,"ewald_dipole:tk");
+    memory->create3d_offset(cs,-kmax,kmax,3,nmax,"ewald_dipole:cs");
+    memory->create3d_offset(sn,-kmax,kmax,3,nmax,"ewald_dipole:sn");
+    kmax_created = kmax;
+  }
+
+  // partial structure factors on each processor
+  // total structure factor by summing over procs
+
+  eik_dot_r();
+
+  MPI_Allreduce(sfacrl,sfacrl_all,kcount,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(sfacim,sfacim_all,kcount,MPI_DOUBLE,MPI_SUM,world);
+
+  // K-space portion of electric field
+  // double loop over K-vectors and local atoms
+  // perform per-atom calculations if needed
+
+  double **f = atom->f;
+  double **t = atom->torque;
+  double **mu = atom->mu;
+  int nlocal = atom->nlocal;
+
+  int kx,ky,kz;
+  double cypz,sypz,exprl,expim;
+  double partial,partial2,partial_peratom;
+  double vcik[6];
+  double mudotk;
+
+  for (i = 0; i < nlocal; i++) {
+    ek[i][0] = ek[i][1] = ek[i][2] = 0.0;
+    tk[i][0] = tk[i][1] = tk[i][2] = 0.0;
+  }
+
+  for (k = 0; k < kcount; k++) {
+    kx = kxvecs[k];
+    ky = kyvecs[k];
+    kz = kzvecs[k];
+    for (j = 0; j<6; j++) vc[k][j] = 0.0;
+
+    for (i = 0; i < nlocal; i++) {
+
+      for (j = 0; j<6; j++) vcik[j] = 0.0;
+      
+      // re-evaluating mu dot k
+      mudotk = mu[i][0]*kx*unitk[0] + mu[i][1]*ky*unitk[1] + mu[i][2]*kz*unitk[2];
+
+      // calculating  re and im of exp(i*k*ri)
+
+      cypz = cs[ky][1][i]*cs[kz][2][i] - sn[ky][1][i]*sn[kz][2][i];
+      sypz = sn[ky][1][i]*cs[kz][2][i] + cs[ky][1][i]*sn[kz][2][i];
+      exprl = cs[kx][0][i]*cypz - sn[kx][0][i]*sypz;
+      expim = sn[kx][0][i]*cypz + cs[kx][0][i]*sypz;
+
+      // taking im of struct_fact x exp(i*k*ri) (for force calc.)
+
+      partial = (mudotk)*(expim*sfacrl_all[k] - exprl*sfacim_all[k]);
+      ek[i][0] += partial * eg[k][0];
+      ek[i][1] += partial * eg[k][1];
+      ek[i][2] += partial * eg[k][2];
+
+      // compute field for torque calculation
+
+      partial_peratom = exprl*sfacrl_all[k] + expim*sfacim_all[k];
+      tk[i][0] += partial_peratom * eg[k][0];
+      tk[i][1] += partial_peratom * eg[k][1];
+      tk[i][2] += partial_peratom * eg[k][2];
+
+      // total and per-atom virial correction (dipole only)
+
+      vc[k][0] += vcik[0] = -(partial_peratom * mu[i][0] * eg[k][0]);
+      vc[k][1] += vcik[1] = -(partial_peratom * mu[i][1] * eg[k][1]);
+      vc[k][2] += vcik[2] = -(partial_peratom * mu[i][2] * eg[k][2]);
+      vc[k][3] += vcik[3] = -(partial_peratom * mu[i][0] * eg[k][1]);
+      vc[k][4] += vcik[4] = -(partial_peratom * mu[i][0] * eg[k][2]);
+      vc[k][5] += vcik[5] = -(partial_peratom * mu[i][1] * eg[k][2]);
+      
+      // taking re-part of struct_fact x exp(i*k*ri) 
+      // (for per-atom energy and virial calc.)
+
+      if (evflag_atom) {
+        if (eflag_atom) eatom[i] += mudotk*ug[k]*partial_peratom;
+        if (vflag_atom)
+          for (j = 0; j < 6; j++)
+	    vatom[i][j] += (ug[k]*mudotk*vg[k][j]*partial_peratom - vcik[j]);
+      }
+    }
+  }
+
+  // force and torque calculation
+
+  const double muscale = qqrd2e * scale;
+
+  for (i = 0; i < nlocal; i++) {
+    f[i][0] += muscale * ek[i][0];
+    f[i][1] += muscale * ek[i][1];
+    if (slabflag != 2) f[i][2] += muscale * ek[i][2];
+    t[i][0] -= muscale * (mu[i][1]*tk[i][2] - mu[i][2]*tk[i][1]);
+    t[i][1] -= muscale * (mu[i][2]*tk[i][0] - mu[i][0]*tk[i][2]);
+    if (slabflag != 2) t[i][2] -= muscale * (mu[i][0]*tk[i][1] - mu[i][1]*tk[i][0]);
+  }
+
+  // sum global energy across Kspace vevs and add in volume-dependent term
+  // taking the re-part of struct_fact_i x struct_fact_j
+  // substracting self energy and scaling
+
+  if (eflag_global) {
+    for (k = 0; k < kcount; k++) {
+      energy += ug[k] * (sfacrl_all[k]*sfacrl_all[k] +
+                         sfacim_all[k]*sfacim_all[k]);
+    }
+    energy -= musqsum*2.0*g3/3.0/MY_PIS;
+    energy *= muscale;
+  }
+
+  // global virial
+
+  if (vflag_global) {
+    double uk, vk;
+    for (k = 0; k < kcount; k++) {
+      uk = ug[k] * (sfacrl_all[k]*sfacrl_all[k] + sfacim_all[k]*sfacim_all[k]);
+      for (j = 0; j < 6; j++) virial[j] += uk*vg[k][j] - vc[k][j];
+    }
+    for (j = 0; j < 6; j++) virial[j] *= muscale;
+  }
+
+  // per-atom energy/virial
+  // energy includes self-energy correction
+
+  if (evflag_atom) {
+    if (eflag_atom) {
+      for (i = 0; i < nlocal; i++) {
+        eatom[i] -= (mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + mu[i][2]*mu[i][2])
+	  *2.0*g3/3.0/MY_PIS;
+        eatom[i] *= muscale;
+      }
+    }
+
+    if (vflag_atom)
+      for (i = 0; i < nlocal; i++)
+        for (j = 0; j < 6; j++) vatom[i][j] *= muscale;
+  }
+
+  // 2d slab correction
+
+  if (slabflag == 1) slabcorr();
+}
+
+/* ----------------------------------------------------------------------
+   compute the struc. factors and mu dot k products
+------------------------------------------------------------------------- */
+
+void EwaldDipole::eik_dot_r()
+{
+  int i,k,l,m,n,ic;
+  double cstr1,sstr1,cstr2,sstr2,cstr3,sstr3,cstr4,sstr4;
+  double sqk,clpm,slpm;
+  double mux, muy, muz;
+  double mudotk;
+
+  double **x = atom->x;
+  double **mu = atom->mu;
+  int nlocal = atom->nlocal;
+
+  n = 0;
+  mux = muy = muz = 0.0;
+
+  // loop on different k-directions
+  // loop on n kpoints and nlocal atoms
+
+  // (k,0,0), (0,l,0), (0,0,m)
+  
+  // loop 1: k=1, l=1, m=1
+  // define first val. of cos and sin
+
+  for (ic = 0; ic < 3; ic++) {
+    sqk = unitk[ic]*unitk[ic];
+    if (sqk <= gsqmx) {
+      cstr1 = 0.0;
+      sstr1 = 0.0;
+      for (i = 0; i < nlocal; i++) {
+        cs[0][ic][i] = 1.0;
+        sn[0][ic][i] = 0.0;
+        cs[1][ic][i] = cos(unitk[ic]*x[i][ic]);
+        sn[1][ic][i] = sin(unitk[ic]*x[i][ic]);
+        cs[-1][ic][i] = cs[1][ic][i];
+        sn[-1][ic][i] = -sn[1][ic][i];
+        mudotk = (mu[i][ic]*unitk[ic]);
+        cstr1 += mudotk*cs[1][ic][i];
+        sstr1 += mudotk*sn[1][ic][i];
+      }
+      sfacrl[n] = cstr1;
+      sfacim[n++] = sstr1;
+    }
+  }
+
+  // loop 2: k>1, l>1, m>1
+
+  for (m = 2; m <= kmax; m++) {
+    for (ic = 0; ic < 3; ic++) {
+      sqk = m*unitk[ic] * m*unitk[ic];
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+          cs[m][ic][i] = cs[m-1][ic][i]*cs[1][ic][i] -
+            sn[m-1][ic][i]*sn[1][ic][i];
+          sn[m][ic][i] = sn[m-1][ic][i]*cs[1][ic][i] +
+            cs[m-1][ic][i]*sn[1][ic][i];
+          cs[-m][ic][i] = cs[m][ic][i];
+          sn[-m][ic][i] = -sn[m][ic][i];
+	  mudotk = (mu[i][ic]*m*unitk[ic]);
+          cstr1 += mudotk*cs[m][ic][i];
+          sstr1 += mudotk*sn[m][ic][i];
+        }
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+      }
+    }
+  }
+
+  // 1 = (k,l,0), 2 = (k,-l,0)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (l = 1; l <= kymax; l++) {
+      sqk = (k*unitk[0] * k*unitk[0]) + (l*unitk[1] * l*unitk[1]);
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        cstr2 = 0.0;
+        sstr2 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+	  mux = mu[i][0];
+	  muy = mu[i][1];
+
+	  // dir 1: (k,l,0)
+	  mudotk = (mux*k*unitk[0] + muy*l*unitk[1]);
+          cstr1 += mudotk*(cs[k][0][i]*cs[l][1][i]-sn[k][0][i]*sn[l][1][i]);
+          sstr1 += mudotk*(sn[k][0][i]*cs[l][1][i]+cs[k][0][i]*sn[l][1][i]);
+	  
+	  // dir 2: (k,-l,0)
+	  mudotk = (mux*k*unitk[0] - muy*l*unitk[1]);
+          cstr2 += mudotk*(cs[k][0][i]*cs[l][1][i]+sn[k][0][i]*sn[l][1][i]);
+          sstr2 += mudotk*(sn[k][0][i]*cs[l][1][i]-cs[k][0][i]*sn[l][1][i]);
+	}
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+        sfacrl[n] = cstr2;
+        sfacim[n++] = sstr2;
+      }
+    }
+  }
+
+  // 1 = (0,l,m), 2 = (0,l,-m)
+
+  for (l = 1; l <= kymax; l++) {
+    for (m = 1; m <= kzmax; m++) {
+      sqk = (l*unitk[1] * l*unitk[1]) + (m*unitk[2] * m*unitk[2]);
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        cstr2 = 0.0;
+        sstr2 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+	  muy = mu[i][1];
+	  muz = mu[i][2];
+
+	  // dir 1: (0,l,m)
+      	  mudotk = (muy*l*unitk[1] + muz*m*unitk[2]); 
+          cstr1 += mudotk*(cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i]);
+          sstr1 += mudotk*(sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i]);
+	  
+	  // dir 2: (0,l,-m)
+	  mudotk = (muy*l*unitk[1] - muz*m*unitk[2]); 
+          cstr2 += mudotk*(cs[l][1][i]*cs[m][2][i]+sn[l][1][i]*sn[m][2][i]);
+          sstr2 += mudotk*(sn[l][1][i]*cs[m][2][i]-cs[l][1][i]*sn[m][2][i]);
+        }
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+        sfacrl[n] = cstr2;
+        sfacim[n++] = sstr2;
+      }
+    }
+  }
+
+  // 1 = (k,0,m), 2 = (k,0,-m)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (m = 1; m <= kzmax; m++) {
+      sqk = (k*unitk[0] * k*unitk[0]) + (m*unitk[2] * m*unitk[2]);
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        cstr2 = 0.0;
+        sstr2 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+      	  mux = mu[i][0];
+	  muz = mu[i][2];
+
+	  // dir 1: (k,0,m)
+	  mudotk = (mux*k*unitk[0] + muz*m*unitk[2]); 
+          cstr1 += mudotk*(cs[k][0][i]*cs[m][2][i]-sn[k][0][i]*sn[m][2][i]);
+          sstr1 += mudotk*(sn[k][0][i]*cs[m][2][i]+cs[k][0][i]*sn[m][2][i]);
+	  
+	  // dir 2: (k,0,-m)
+	  mudotk = (mux*k*unitk[0] - muz*m*unitk[2]); 
+          cstr2 += mudotk*(cs[k][0][i]*cs[m][2][i]+sn[k][0][i]*sn[m][2][i]);
+          sstr2 += mudotk*(sn[k][0][i]*cs[m][2][i]-cs[k][0][i]*sn[m][2][i]);
+        }
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+        sfacrl[n] = cstr2;
+        sfacim[n++] = sstr2;
+      }
+    }
+  }
+
+  // 1 = (k,l,m), 2 = (k,-l,m), 3 = (k,l,-m), 4 = (k,-l,-m)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (l = 1; l <= kymax; l++) {
+      for (m = 1; m <= kzmax; m++) {
+        sqk = (k*unitk[0] * k*unitk[0]) + (l*unitk[1] * l*unitk[1]) +
+          (m*unitk[2] * m*unitk[2]);
+        if (sqk <= gsqmx) {
+          cstr1 = 0.0;
+          sstr1 = 0.0;
+          cstr2 = 0.0;
+          sstr2 = 0.0;
+          cstr3 = 0.0;
+          sstr3 = 0.0;
+          cstr4 = 0.0;
+          sstr4 = 0.0;
+          for (i = 0; i < nlocal; i++) {
+      	    mux = mu[i][0];
+	    muy = mu[i][1];
+	    muz = mu[i][2];
+
+	    // dir 1: (k,l,m)
+	    mudotk = (mux*k*unitk[0] + muy*l*unitk[1] + muz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
+            slpm = sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
+            cstr1 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr1 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+	    // dir 2: (k,-l,m)
+	    mudotk = (mux*k*unitk[0] - muy*l*unitk[1] + muz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i];
+            slpm = -sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
+            cstr2 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr2 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+	    // dir 3: (k,l,-m)
+	    mudotk = (mux*k*unitk[0] + muy*l*unitk[1] - muz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i];
+            slpm = sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i];
+            cstr3 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr3 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+	    // dir 4: (k,-l,-m)
+	    mudotk = (mux*k*unitk[0] - muy*l*unitk[1] - muz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
+            slpm = -sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i];
+            cstr4 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr4 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+          }
+          sfacrl[n] = cstr1;
+          sfacim[n++] = sstr1;
+          sfacrl[n] = cstr2;
+          sfacim[n++] = sstr2;
+          sfacrl[n] = cstr3;
+          sfacim[n++] = sstr3;
+          sfacrl[n] = cstr4;
+          sfacim[n++] = sstr4;
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Slab-geometry correction term to dampen inter-slab interactions between
+   periodically repeating slabs.  Yields good approximation to 2D EwaldDipole if
+   adequate empty space is left between repeating slabs (J. Chem. Phys.
+   111, 3155).  Slabs defined here to be parallel to the xy plane. Also
+   extended to non-neutral systems (J. Chem. Phys. 131, 094107).
+------------------------------------------------------------------------- */
+
+void EwaldDipole::slabcorr()
+{
+  // compute local contribution to global dipole moment
+
+  double **x = atom->x;
+  double zprd = domain->zprd;
+  int nlocal = atom->nlocal;
+
+  double dipole = 0.0;
+  double **mu = atom->mu;
+  for (int i = 0; i < nlocal; i++) dipole += mu[i][2];
+
+  // sum local contributions to get global dipole moment
+
+  double dipole_all;
+  MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world);
+
+  // need to make non-neutral systems and/or
+  //  per-atom energy translationally invariant
+
+  if (eflag_atom || fabs(qsum) > SMALL) {
+
+    error->all(FLERR,"Cannot (yet) use kspace slab correction with "
+      "long-range dipoles and non-neutral systems or per-atom energy");
+  }
+
+  // compute corrections
+
+  const double e_slabcorr = MY_2PI*(dipole_all*dipole_all/12.0)/volume;
+  const double qscale = qqrd2e * scale;
+
+  if (eflag_global) energy += qscale * e_slabcorr;
+
+  // per-atom energy
+
+  if (eflag_atom) {
+    double efact = qscale * MY_2PI/volume/12.0;
+    for (int i = 0; i < nlocal; i++)
+      eatom[i] += efact * mu[i][2]*dipole_all;
+  }
+
+  // add on torque corrections
+
+  if (atom->torque) {
+    double ffact = qscale * (-4.0*MY_PI/volume);
+    double **mu = atom->mu;
+    double **torque = atom->torque;
+    for (int i = 0; i < nlocal; i++) {
+      torque[i][0] += ffact * dipole_all * mu[i][1];
+      torque[i][1] += -ffact * dipole_all * mu[i][0];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute musum,musqsum,mu2
+   called initially, when particle count changes, when dipoles are changed
+------------------------------------------------------------------------- */
+
+void EwaldDipole::musum_musq()
+{
+  const int nlocal = atom->nlocal;
+
+  musum = musqsum = mu2 = 0.0;
+  if (atom->mu_flag) {
+    double** mu = atom->mu;
+    double musum_local(0.0), musqsum_local(0.0);
+
+    for (int i = 0; i < nlocal; i++) {
+      musum_local += mu[i][0] + mu[i][1] + mu[i][2];
+      musqsum_local += mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + mu[i][2]*mu[i][2];
+    }
+
+    MPI_Allreduce(&musum_local,&musum,1,MPI_DOUBLE,MPI_SUM,world);
+    MPI_Allreduce(&musqsum_local,&musqsum,1,MPI_DOUBLE,MPI_SUM,world);
+
+    mu2 = musqsum * force->qqrd2e;
+  }
+
+  if (mu2 == 0 && comm->me == 0)
+    error->all(FLERR,"Using kspace solver EwaldDipole on system with no dipoles");
+}
+
+/* ----------------------------------------------------------------------
+   Newton solver used to find g_ewald for LJ systems
+------------------------------------------------------------------------- */
+
+double EwaldDipole::NewtonSolve(double x, double Rc,
+                              bigint natoms, double vol, double b2)
+{
+  double dx,tol;
+  int maxit;
+
+  maxit = 10000; //Maximum number of iterations
+  tol = 0.00001; //Convergence tolerance
+
+  //Begin algorithm
+
+  for (int i = 0; i < maxit; i++) {
+    dx = f(x,Rc,natoms,vol,b2) / derivf(x,Rc,natoms,vol,b2);
+    x = x - dx; //Update x
+    if (fabs(dx) < tol) return x;
+    if (x < 0 || x != x) // solver failed
+      return -1;
+  }
+  return -1;
+}
+
+/* ----------------------------------------------------------------------
+ Calculate f(x)
+ ------------------------------------------------------------------------- */
+
+double EwaldDipole::f(double x, double Rc, bigint natoms, double vol, double b2)
+{
+  double a = Rc*x;
+  double f = 0.0;
+
+  double rg2 = a*a;
+  double rg4 = rg2*rg2;
+  double rg6 = rg4*rg2;
+  double Cc = 4.0*rg4 + 6.0*rg2 + 3.0;
+  double Dc = 8.0*rg6 + 20.0*rg4 + 30.0*rg2 + 15.0;
+  f = (b2/(sqrt(vol*powint(x,4)*powint(Rc,9)*natoms)) *
+    sqrt(13.0/6.0*Cc*Cc + 2.0/15.0*Dc*Dc - 13.0/15.0*Cc*Dc) *
+    exp(-rg2)) - accuracy;
+
+  return f;
+}
+
+/* ----------------------------------------------------------------------
+ Calculate numerical derivative f'(x)
+ ------------------------------------------------------------------------- */
+
+double EwaldDipole::derivf(double x, double Rc,
+                         bigint natoms, double vol, double b2)
+{
+  double h = 0.000001;  //Derivative step-size
+  return (f(x + h,Rc,natoms,vol,b2) - f(x,Rc,natoms,vol,b2)) / h;
+}
+
--- a/src/KSPACE/ewald_dipole.h
+++ b/src/KSPACE/ewald_dipole.h
@ -0,0 +1,104 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(ewald/dipole,EwaldDipole)
+
+#else
+
+#ifndef LMP_EWALD_DIPOLE_H
+#define LMP_EWALD_DIPOLE_H
+
+#include "ewald.h"
+
+namespace LAMMPS_NS {
+
+class EwaldDipole : public Ewald {
+ public:
+  EwaldDipole(class LAMMPS *);
+  virtual ~EwaldDipole();
+  void init();
+  void setup();
+  virtual void compute(int, int);
+
+ protected:
+  double musum,musqsum,mu2;
+  double **tk;			// field for torque 
+  double **vc;			// virial per k
+
+  void musum_musq(); 
+  double rms_dipole(int, double, bigint);
+  virtual void eik_dot_r();
+  void slabcorr();
+  double NewtonSolve(double, double, bigint, double, double);
+  double f(double, double, bigint, double, double);
+  double derivf(double, double, bigint, double, double);
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Cannot use EwaldDipole with 2d simulation
+
+The kspace style ewald cannot be used in 2d simulations.  You can use
+2d EwaldDipole in a 3d simulation; see the kspace_modify command.
+
+E: Kspace style requires atom attribute q
+
+The atom style defined does not have these attributes.
+
+E: Cannot use nonperiodic boundaries with EwaldDipole
+
+For kspace style ewald, all 3 dimensions must have periodic boundaries
+unless you use the kspace_modify command to define a 2d slab with a
+non-periodic z dimension.
+
+E: Incorrect boundaries with slab EwaldDipole
+
+Must have periodic x,y dimensions and non-periodic z dimension to use
+2d slab option with EwaldDipole.
+
+E: Cannot (yet) use EwaldDipole with triclinic box and slab correction
+
+This feature is not yet supported.
+
+E: KSpace style is incompatible with Pair style
+
+Setting a kspace style requires that a pair style with matching
+long-range Coulombic or dispersion components be used.
+
+E: KSpace accuracy must be > 0
+
+The kspace accuracy designated in the input must be greater than zero.
+
+E: Must use 'kspace_modify gewald' for uncharged system
+
+UNDOCUMENTED
+
+E: Cannot (yet) use K-space slab correction with compute group/group for triclinic systems
+
+This option is not yet supported.
+
+*/
--- a/src/KSPACE/ewald_dipole_spin.cpp
+++ b/src/KSPACE/ewald_dipole_spin.cpp
@ -0,0 +1,857 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Julien Tranchida (SNL)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include "ewald_dipole_spin.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "pair.h"
+#include "domain.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "error.h"
+#include "update.h"
+
+#include "math_const.h"
+#include "math_special.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecial;
+
+#define SMALL 0.00001
+
+/* ---------------------------------------------------------------------- */
+
+EwaldDipoleSpin::EwaldDipoleSpin(LAMMPS *lmp) : 
+  EwaldDipole(lmp)
+{
+  dipoleflag = 0;
+  spinflag = 1;
+  
+  hbar = force->hplanck/MY_2PI;         	// eV/(rad.THz)
+  mub = 9.274e-4;                     		// in A.Ang^2
+  mu_0 = 785.15;               			// in eV/Ang/A^2
+  mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);	// in eV.Ang^3
+  mub2mu0hbinv = mub2mu0 / hbar;        	// in rad.THz
+}
+
+/* ----------------------------------------------------------------------
+   free all memory
+------------------------------------------------------------------------- */
+
+EwaldDipoleSpin::~EwaldDipoleSpin() {}
+
+/* ----------------------------------------------------------------------
+   called once before run
+------------------------------------------------------------------------- */
+
+void EwaldDipoleSpin::init()
+{
+  if (comm->me == 0) {
+    if (screen) fprintf(screen,"EwaldDipoleSpin initialization ...\n");
+    if (logfile) fprintf(logfile,"EwaldDipoleSpin initialization ...\n");
+  }
+
+  // error check
+  
+  spinflag = atom->sp?1:0;
+
+  triclinic_check();
+  
+  // no triclinic ewald spin (yet)
+  
+  triclinic = domain->triclinic;
+  if (triclinic)
+    error->all(FLERR,"Cannot (yet) use EwaldDipoleSpin with triclinic box");
+  
+  if (domain->dimension == 2)
+    error->all(FLERR,"Cannot use EwaldDipoleSpin with 2d simulation");
+
+  if (!atom->sp) error->all(FLERR,"Kspace style requires atom attribute sp");
+
+  if ((spinflag && strcmp(update->unit_style,"metal")) != 0)
+    error->all(FLERR,"'metal' units have to be used with spins");
+  
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all(FLERR,"Cannot use nonperiodic boundaries with EwaldDipoleSpin");
+  if (slabflag) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all(FLERR,"Incorrect boundaries with slab EwaldDipoleSpin");
+  }
+
+  // extract short-range Coulombic cutoff from pair style
+
+  pair_check();
+
+  int itmp;
+  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
+  if (p_cutoff == NULL)
+    error->all(FLERR,"KSpace style is incompatible with Pair style");
+  double cutoff = *p_cutoff;
+
+  // kspace TIP4P not yet supported
+  // qdist = offset only for TIP4P fictitious charge
+
+  //qdist = 0.0;
+  if (tip4pflag)
+    error->all(FLERR,"Cannot yet use TIP4P with EwaldDipoleSpin");
+
+  // compute musum & musqsum and warn if no spin
+
+  scale = 1.0;
+  qqrd2e = force->qqrd2e;
+  spsum_musq();
+  natoms_original = atom->natoms;
+
+  // set accuracy (force units) from accuracy_relative or accuracy_absolute
+
+  if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
+  else accuracy = accuracy_relative * two_charge_force;
+
+  // setup K-space resolution
+
+  bigint natoms = atom->natoms;
+
+  // use xprd,yprd,zprd even if triclinic so grid size is the same
+  // adjust z dimension for 2d slab EwaldDipoleSpin
+  // 3d EwaldDipoleSpin just uses zprd since slab_volfactor = 1.0
+
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  double zprd_slab = zprd*slab_volfactor;
+
+  // make initial g_ewald estimate
+  // based on desired accuracy and real space cutoff
+  // fluid-occupied volume used to estimate real-space error
+  // zprd used rather than zprd_slab
+
+  if (!gewaldflag) {
+    if (accuracy <= 0.0)
+      error->all(FLERR,"KSpace accuracy must be > 0");
+    
+    // initial guess with old method
+    
+    g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*mu2);
+    if (g_ewald >= 1.0) g_ewald = (1.35 - 0.15*log(accuracy))/cutoff;
+    else g_ewald = sqrt(-log(g_ewald)) / cutoff;
+    
+    // try Newton solver
+
+    double g_ewald_new =
+      NewtonSolve(g_ewald,cutoff,natoms,xprd*yprd*zprd,mu2);
+    if (g_ewald_new > 0.0) g_ewald = g_ewald_new;
+    else error->warning(FLERR,"Ewald/disp Newton solver failed, "
+                        "using old method to estimate g_ewald");  
+  }
+
+  // setup EwaldDipoleSpin coefficients so can print stats
+
+  setup();
+
+  // final RMS accuracy
+
+  double lprx = rms(kxmax_orig,xprd,natoms,q2);
+  double lpry = rms(kymax_orig,yprd,natoms,q2);
+  double lprz = rms(kzmax_orig,zprd_slab,natoms,q2);
+  double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
+  double q2_over_sqrt = q2 / sqrt(natoms*cutoff*xprd*yprd*zprd_slab);
+  double spr = 2.0 *q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff);
+  double tpr = estimate_table_accuracy(q2_over_sqrt,spr);
+  double estimated_accuracy = sqrt(lpr*lpr + spr*spr + tpr*tpr);
+
+  // stats
+
+  if (comm->me == 0) {
+    if (screen) {
+      fprintf(screen,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(screen,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(screen,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(screen,"  KSpace vectors: actual max1d max3d = %d %d %d\n",
+              kcount,kmax,kmax3d);
+      fprintf(screen,"                  kxmax kymax kzmax  = %d %d %d\n",
+              kxmax,kymax,kzmax);
+    }
+    if (logfile) {
+      fprintf(logfile,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(logfile,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(logfile,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(logfile,"  KSpace vectors: actual max1d max3d = %d %d %d\n",
+              kcount,kmax,kmax3d);
+      fprintf(logfile,"                  kxmax kymax kzmax  = %d %d %d\n",
+              kxmax,kymax,kzmax);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   adjust EwaldDipoleSpin coeffs, called initially and whenever volume has changed
+------------------------------------------------------------------------- */
+
+void EwaldDipoleSpin::setup()
+{
+  // volume-dependent factors
+
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+
+  // adjustment of z dimension for 2d slab EwaldDipoleSpin
+  // 3d EwaldDipoleSpin just uses zprd since slab_volfactor = 1.0
+
+  double zprd_slab = zprd*slab_volfactor;
+  volume = xprd * yprd * zprd_slab;
+
+  unitk[0] = 2.0*MY_PI/xprd;
+  unitk[1] = 2.0*MY_PI/yprd;
+  unitk[2] = 2.0*MY_PI/zprd_slab;
+
+  int kmax_old = kmax;
+
+  if (kewaldflag == 0) {
+
+    // determine kmax
+    // function of current box size, accuracy, G_ewald (short-range cutoff)
+
+    bigint natoms = atom->natoms;
+    double err;
+    kxmax = 1;
+    kymax = 1;
+    kzmax = 1;	
+
+    // set kmax in 3 directions to respect accuracy
+
+    err = rms_dipole(kxmax,xprd,natoms);
+    while (err > accuracy) {
+      kxmax++;
+      err = rms_dipole(kxmax,xprd,natoms);
+    }
+
+    err = rms_dipole(kymax,yprd,natoms);
+    while (err > accuracy) {
+      kymax++;
+      err = rms_dipole(kymax,yprd,natoms);
+    }
+
+    err = rms_dipole(kzmax,zprd,natoms);
+    while (err > accuracy) {
+      kzmax++;
+      err = rms_dipole(kzmax,zprd,natoms);
+    }
+
+    kmax = MAX(kxmax,kymax);
+    kmax = MAX(kmax,kzmax);
+    kmax3d = 4*kmax*kmax*kmax + 6*kmax*kmax + 3*kmax;
+
+    double gsqxmx = unitk[0]*unitk[0]*kxmax*kxmax;
+    double gsqymx = unitk[1]*unitk[1]*kymax*kymax;
+    double gsqzmx = unitk[2]*unitk[2]*kzmax*kzmax;
+    gsqmx = MAX(gsqxmx,gsqymx);
+    gsqmx = MAX(gsqmx,gsqzmx);
+
+    kxmax_orig = kxmax;
+    kymax_orig = kymax;
+    kzmax_orig = kzmax;
+
+  } else {
+
+    kxmax = kx_ewald;
+    kymax = ky_ewald;
+    kzmax = kz_ewald;
+
+    kxmax_orig = kxmax;
+    kymax_orig = kymax;
+    kzmax_orig = kzmax;
+
+    kmax = MAX(kxmax,kymax);
+    kmax = MAX(kmax,kzmax);
+    kmax3d = 4*kmax*kmax*kmax + 6*kmax*kmax + 3*kmax;
+
+    double gsqxmx = unitk[0]*unitk[0]*kxmax*kxmax;
+    double gsqymx = unitk[1]*unitk[1]*kymax*kymax;
+    double gsqzmx = unitk[2]*unitk[2]*kzmax*kzmax;
+    gsqmx = MAX(gsqxmx,gsqymx);
+    gsqmx = MAX(gsqmx,gsqzmx);
+  }
+
+  gsqmx *= 1.00001;
+
+  // if size has grown, reallocate k-dependent and nlocal-dependent arrays
+
+  if (kmax > kmax_old) {
+    deallocate();
+    allocate();
+    group_allocate_flag = 0;
+
+    memory->destroy(ek);
+    memory->destroy(tk);
+    memory->destroy(vc);
+    memory->destroy3d_offset(cs,-kmax_created);
+    memory->destroy3d_offset(sn,-kmax_created);
+    nmax = atom->nmax;
+    memory->create(ek,nmax,3,"ewald_dipole_spin:ek");
+    memory->create(tk,nmax,3,"ewald_dipole_spin:tk");
+    memory->create(vc,kmax3d,6,"ewald_dipole_spin:tk");
+    memory->create3d_offset(cs,-kmax,kmax,3,nmax,"ewald_dipole_spin:cs");
+    memory->create3d_offset(sn,-kmax,kmax,3,nmax,"ewald_dipole_spin:sn");
+    kmax_created = kmax;
+  }
+
+  // pre-compute EwaldDipoleSpin coefficients
+
+  coeffs();
+}
+
+/* ----------------------------------------------------------------------
+   compute the EwaldDipoleSpin long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+void EwaldDipoleSpin::compute(int eflag, int vflag)
+{
+  int i,j,k;
+  const double g3 = g_ewald*g_ewald*g_ewald;
+  double spx, spy, spz;
+
+  // set energy/virial flags
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  // if atom count has changed, update qsum and qsqsum
+
+  if (atom->natoms != natoms_original) {
+    spsum_musq();
+    natoms_original = atom->natoms;
+  }
+
+  // return if there are no charges
+
+  if (musqsum == 0.0) return;
+
+  // extend size of per-atom arrays if necessary
+
+  if (atom->nmax > nmax) {
+    memory->destroy(ek);
+    memory->destroy(tk);
+    memory->destroy(vc);
+    memory->destroy3d_offset(cs,-kmax_created);
+    memory->destroy3d_offset(sn,-kmax_created);
+    nmax = atom->nmax;
+    memory->create(ek,nmax,3,"ewald_dipole_spin:ek");
+    memory->create(tk,nmax,3,"ewald_dipole_spin:tk");
+    memory->create(vc,kmax3d,6,"ewald_dipole_spin:tk");
+    memory->create3d_offset(cs,-kmax,kmax,3,nmax,"ewald_dipole_spin:cs");
+    memory->create3d_offset(sn,-kmax,kmax,3,nmax,"ewald_dipole_spin:sn");
+    kmax_created = kmax;
+  }
+
+  // partial structure factors on each processor
+  // total structure factor by summing over procs
+
+  eik_dot_r();
+
+  MPI_Allreduce(sfacrl,sfacrl_all,kcount,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(sfacim,sfacim_all,kcount,MPI_DOUBLE,MPI_SUM,world);
+
+  // K-space portion of electric field
+  // double loop over K-vectors and local atoms
+  // perform per-atom calculations if needed
+
+  double **f = atom->f;
+  double **fm_long = atom->fm_long;
+  double **sp = atom->sp;
+  int nlocal = atom->nlocal;
+
+  int kx,ky,kz;
+  double cypz,sypz,exprl,expim;
+  double partial,partial2,partial_peratom;
+  double vcik[6];
+  double mudotk;
+
+  for (i = 0; i < nlocal; i++) {
+    ek[i][0] = ek[i][1] = ek[i][2] = 0.0;
+    tk[i][0] = tk[i][1] = tk[i][2] = 0.0;
+  }
+
+  for (k = 0; k < kcount; k++) {
+    kx = kxvecs[k];
+    ky = kyvecs[k];
+    kz = kzvecs[k];
+    for (j = 0; j<6; j++) vc[k][j] = 0.0;
+
+    for (i = 0; i < nlocal; i++) {
+
+      for (j = 0; j<6; j++) vcik[j] = 0.0;
+
+      // re-evaluating sp dot k
+      
+      spx = sp[i][0]*sp[i][3];
+      spy = sp[i][1]*sp[i][3];
+      spz = sp[i][2]*sp[i][3];
+      mudotk = spx*kx*unitk[0] + spy*ky*unitk[1] + spz*kz*unitk[2];
+
+      // calculating  re and im of exp(i*k*ri)
+
+      cypz = cs[ky][1][i]*cs[kz][2][i] - sn[ky][1][i]*sn[kz][2][i];
+      sypz = sn[ky][1][i]*cs[kz][2][i] + cs[ky][1][i]*sn[kz][2][i];
+      exprl = cs[kx][0][i]*cypz - sn[kx][0][i]*sypz;
+      expim = sn[kx][0][i]*cypz + cs[kx][0][i]*sypz;
+
+      // taking im of struct_fact x exp(i*k*ri) (for force calc.)
+
+      partial = mudotk*(expim*sfacrl_all[k] - exprl*sfacim_all[k]);
+      ek[i][0] += partial * eg[k][0];
+      ek[i][1] += partial * eg[k][1];
+      ek[i][2] += partial * eg[k][2];
+
+      // compute field for torque calculation
+
+      partial_peratom = exprl*sfacrl_all[k] + expim*sfacim_all[k];
+      tk[i][0] += partial2*eg[k][0];
+      tk[i][1] += partial2*eg[k][1];
+      tk[i][2] += partial2*eg[k][2];
+
+      // total and per-atom virial correction
+      
+      vc[k][0] += vcik[0] = -(partial_peratom * spx * eg[k][0]);
+      vc[k][1] += vcik[1] = -(partial_peratom * spy * eg[k][1]);
+      vc[k][2] += vcik[2] = -(partial_peratom * spz * eg[k][2]);
+      vc[k][3] += vcik[3] = -(partial_peratom * spx * eg[k][1]);
+      vc[k][4] += vcik[4] = -(partial_peratom * spx * eg[k][2]);
+      vc[k][5] += vcik[5] = -(partial_peratom * spy * eg[k][2]);
+      
+      // taking re-part of struct_fact x exp(i*k*ri) 
+      // (for per-atom energy and virial calc.)
+
+      if (evflag_atom) {
+        if (eflag_atom) eatom[i] += mudotk*ug[k]*partial_peratom;
+        if (vflag_atom)
+          for (j = 0; j < 6; j++)
+	    vatom[i][j] += (ug[k]*mudotk*vg[k][j]*partial_peratom - vcik[j]);
+      }
+    }
+  }
+
+  // force and mag. precession vectors calculation
+  
+  const double spscale = mub2mu0 * scale;
+  const double spscale2 = mub2mu0hbinv * scale;
+
+  for (i = 0; i < nlocal; i++) {
+    f[i][0] += spscale * ek[i][0];
+    f[i][1] += spscale * ek[i][1];
+    if (slabflag != 2) f[i][2] += spscale * ek[i][2];
+    fm_long[i][0] += spscale2 * tk[i][0];
+    fm_long[i][1] += spscale2 * tk[i][1];
+    if (slabflag != 2) fm_long[i][2] += spscale2 * tk[i][3];
+  }
+  
+  // sum global energy across Kspace vevs and add in volume-dependent term
+  // taking the re-part of struct_fact_i x struct_fact_j
+  // substracting self energy and scaling
+
+  if (eflag_global) {
+    for (k = 0; k < kcount; k++) {
+      energy += ug[k] * (sfacrl_all[k]*sfacrl_all[k] +
+                         sfacim_all[k]*sfacim_all[k]);
+    }
+    energy -= musqsum*2.0*g3/3.0/MY_PIS;
+    energy *= spscale;
+  }
+
+  // global virial
+
+  if (vflag_global) {
+    double uk, vk;
+    for (k = 0; k < kcount; k++) {
+      uk = ug[k] * (sfacrl_all[k]*sfacrl_all[k] + sfacim_all[k]*sfacim_all[k]);
+      for (j = 0; j < 6; j++) virial[j] += uk*vg[k][j] - vc[k][j];
+    }
+    for (j = 0; j < 6; j++) virial[j] *= spscale;
+  }
+
+  // per-atom energy/virial
+  // energy includes self-energy correction
+
+  if (evflag_atom) {
+    if (eflag_atom) {
+      for (i = 0; i < nlocal; i++) {
+	spx = sp[i][0]*sp[i][3];
+	spy = sp[i][1]*sp[i][3];
+	spz = sp[i][2]*sp[i][3];
+        eatom[i] -= (spx*spx + spy*spy + spz*spz)
+	  *2.0*g3/3.0/MY_PIS;
+        eatom[i] *= spscale;
+      }
+    }
+
+    if (vflag_atom)
+      for (i = 0; i < nlocal; i++)
+        for (j = 0; j < 6; j++) vatom[i][j] *= spscale;
+  }
+
+  // 2d slab correction
+
+  if (slabflag == 1) slabcorr();
+}
+
+/* ----------------------------------------------------------------------
+   compute the struc. factors and mu dot k products
+------------------------------------------------------------------------- */
+
+void EwaldDipoleSpin::eik_dot_r()
+{
+  int i,k,l,m,n,ic;
+  double cstr1,sstr1,cstr2,sstr2,cstr3,sstr3,cstr4,sstr4;
+  double sqk,clpm,slpm;
+  double spx, spy, spz, spi;
+  double mudotk;
+
+  double **x = atom->x;
+  double **sp = atom->sp;
+  int nlocal = atom->nlocal;
+
+  n = 0;
+  spi = spx = spy = spz = 0.0;
+
+  // loop on different k-directions
+  // loop on n kpoints and nlocal atoms
+  // store (n x nlocal) tab. of values of (mu_i dot k)
+  // store n values of sum_j[ (mu_j dot k) exp(-k dot r_j) ]
+
+  // (k,0,0), (0,l,0), (0,0,m)
+  
+  // loop 1: k=1, l=1, m=1
+  // define first val. of cos and sin
+
+  for (ic = 0; ic < 3; ic++) {
+    sqk = unitk[ic]*unitk[ic];
+    if (sqk <= gsqmx) {
+      cstr1 = 0.0;
+      sstr1 = 0.0;
+      for (i = 0; i < nlocal; i++) {
+        cs[0][ic][i] = 1.0;
+        sn[0][ic][i] = 0.0;
+        cs[1][ic][i] = cos(unitk[ic]*x[i][ic]);
+        sn[1][ic][i] = sin(unitk[ic]*x[i][ic]);
+        cs[-1][ic][i] = cs[1][ic][i];
+        sn[-1][ic][i] = -sn[1][ic][i];
+	spi = sp[i][ic]*sp[i][3];
+        mudotk = (spi*unitk[ic]);
+        cstr1 += mudotk*cs[1][ic][i];
+        sstr1 += mudotk*sn[1][ic][i];
+      }
+      sfacrl[n] = cstr1;
+      sfacim[n++] = sstr1;
+    }
+  }
+
+  // loop 2: k>1, l>1, m>1
+
+  for (m = 2; m <= kmax; m++) {
+    for (ic = 0; ic < 3; ic++) {
+      sqk = m*unitk[ic] * m*unitk[ic];
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+          cs[m][ic][i] = cs[m-1][ic][i]*cs[1][ic][i] -
+            sn[m-1][ic][i]*sn[1][ic][i];
+          sn[m][ic][i] = sn[m-1][ic][i]*cs[1][ic][i] +
+            cs[m-1][ic][i]*sn[1][ic][i];
+          cs[-m][ic][i] = cs[m][ic][i];
+          sn[-m][ic][i] = -sn[m][ic][i];
+	  spi = sp[i][ic]*sp[i][3];
+	  mudotk = (spi*m*unitk[ic]);
+          cstr1 += mudotk*cs[m][ic][i];
+          sstr1 += mudotk*sn[m][ic][i];
+        }
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+      }
+    }
+  }
+
+  // 1 = (k,l,0), 2 = (k,-l,0)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (l = 1; l <= kymax; l++) {
+      sqk = (k*unitk[0] * k*unitk[0]) + (l*unitk[1] * l*unitk[1]);
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        cstr2 = 0.0;
+        sstr2 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+	  spx = sp[i][0]*sp[i][3];
+	  spy = sp[i][1]*sp[i][3];
+
+	  // dir 1: (k,l,0)
+	  mudotk = (spx*k*unitk[0] + spy*l*unitk[1]);
+          cstr1 += mudotk*(cs[k][0][i]*cs[l][1][i]-sn[k][0][i]*sn[l][1][i]);
+          sstr1 += mudotk*(sn[k][0][i]*cs[l][1][i]+cs[k][0][i]*sn[l][1][i]);
+	  
+	  // dir 2: (k,-l,0)
+	  mudotk = (spx*k*unitk[0] - spy*l*unitk[1]);
+          cstr2 += mudotk*(cs[k][0][i]*cs[l][1][i]+sn[k][0][i]*sn[l][1][i]);
+          sstr2 += mudotk*(sn[k][0][i]*cs[l][1][i]-cs[k][0][i]*sn[l][1][i]);
+	}
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+        sfacrl[n] = cstr2;
+        sfacim[n++] = sstr2;
+      }
+    }
+  }
+
+  // 1 = (0,l,m), 2 = (0,l,-m)
+
+  for (l = 1; l <= kymax; l++) {
+    for (m = 1; m <= kzmax; m++) {
+      sqk = (l*unitk[1] * l*unitk[1]) + (m*unitk[2] * m*unitk[2]);
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        cstr2 = 0.0;
+        sstr2 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+	  spy = sp[i][1]*sp[i][3];
+	  spz = sp[i][2]*sp[i][3];
+
+	  // dir 1: (0,l,m)
+      	  mudotk = (spy*l*unitk[1] + spz*m*unitk[2]); 
+          cstr1 += mudotk*(cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i]);
+          sstr1 += mudotk*(sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i]);
+	  
+	  // dir 2: (0,l,-m)
+	  mudotk = (spy*l*unitk[1] - spz*m*unitk[2]); 
+          cstr2 += mudotk*(cs[l][1][i]*cs[m][2][i]+sn[l][1][i]*sn[m][2][i]);
+          sstr2 += mudotk*(sn[l][1][i]*cs[m][2][i]-cs[l][1][i]*sn[m][2][i]);
+        }
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+        sfacrl[n] = cstr2;
+        sfacim[n++] = sstr2;
+      }
+    }
+  }
+
+  // 1 = (k,0,m), 2 = (k,0,-m)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (m = 1; m <= kzmax; m++) {
+      sqk = (k*unitk[0] * k*unitk[0]) + (m*unitk[2] * m*unitk[2]);
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        cstr2 = 0.0;
+        sstr2 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+      	  spx = sp[i][0]*sp[i][3];
+	  spz = sp[i][2]*sp[i][3];
+
+	  // dir 1: (k,0,m)
+	  mudotk = (spx*k*unitk[0] + spz*m*unitk[2]); 
+          cstr1 += mudotk*(cs[k][0][i]*cs[m][2][i]-sn[k][0][i]*sn[m][2][i]);
+          sstr1 += mudotk*(sn[k][0][i]*cs[m][2][i]+cs[k][0][i]*sn[m][2][i]);
+	  
+	  // dir 2: (k,0,-m)
+	  mudotk = (spx*k*unitk[0] - spz*m*unitk[2]); 
+          cstr2 += mudotk*(cs[k][0][i]*cs[m][2][i]+sn[k][0][i]*sn[m][2][i]);
+          sstr2 += mudotk*(sn[k][0][i]*cs[m][2][i]-cs[k][0][i]*sn[m][2][i]);
+        }
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+        sfacrl[n] = cstr2;
+        sfacim[n++] = sstr2;
+      }
+    }
+  }
+
+  // 1 = (k,l,m), 2 = (k,-l,m), 3 = (k,l,-m), 4 = (k,-l,-m)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (l = 1; l <= kymax; l++) {
+      for (m = 1; m <= kzmax; m++) {
+        sqk = (k*unitk[0] * k*unitk[0]) + (l*unitk[1] * l*unitk[1]) +
+          (m*unitk[2] * m*unitk[2]);
+        if (sqk <= gsqmx) {
+          cstr1 = 0.0;
+          sstr1 = 0.0;
+          cstr2 = 0.0;
+          sstr2 = 0.0;
+          cstr3 = 0.0;
+          sstr3 = 0.0;
+          cstr4 = 0.0;
+          sstr4 = 0.0;
+          for (i = 0; i < nlocal; i++) {
+      	    spx = sp[i][0]*sp[i][3];
+	    spy = sp[i][1]*sp[i][3];
+	    spz = sp[i][2]*sp[i][3];
+
+	    // dir 1: (k,l,m)
+	    mudotk = (spx*k*unitk[0] + spy*l*unitk[1] + spz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
+            slpm = sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
+            cstr1 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr1 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+	    // dir 2: (k,-l,m)
+	    mudotk = (spx*k*unitk[0] - spy*l*unitk[1] + spz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i];
+            slpm = -sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
+            cstr2 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr2 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+	    // dir 3: (k,l,-m)
+	    mudotk = (spx*k*unitk[0] + spy*l*unitk[1] - spz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i];
+            slpm = sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i];
+            cstr3 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr3 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+	    // dir 4: (k,-l,-m)
+	    mudotk = (spx*k*unitk[0] - spy*l*unitk[1] - spz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
+            slpm = -sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i];
+            cstr4 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr4 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+          }
+          sfacrl[n] = cstr1;
+          sfacim[n++] = sstr1;
+          sfacrl[n] = cstr2;
+          sfacim[n++] = sstr2;
+          sfacrl[n] = cstr3;
+          sfacim[n++] = sstr3;
+          sfacrl[n] = cstr4;
+          sfacim[n++] = sstr4;
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Slab-geometry correction term to dampen inter-slab interactions between
+   periodically repeating slabs.  Yields good approximation to 2D EwaldDipoleSpin if
+   adequate empty space is left between repeating slabs (J. Chem. Phys.
+   111, 3155).  Slabs defined here to be parallel to the xy plane. Also
+   extended to non-neutral systems (J. Chem. Phys. 131, 094107).
+------------------------------------------------------------------------- */
+
+void EwaldDipoleSpin::slabcorr()
+{
+  // compute local contribution to global dipole/spin moment
+
+  double **x = atom->x;
+  double zprd = domain->zprd;
+  int nlocal = atom->nlocal;
+
+  double spin = 0.0;
+  double **sp = atom->sp;
+  double spx,spy,spz;
+  for (int i = 0; i < nlocal; i++) { 
+    spz = sp[i][2]*sp[i][3];
+    spin += spz;
+  }
+
+  // sum local contributions to get global spin moment
+
+  double spin_all;
+  MPI_Allreduce(&spin,&spin_all,1,MPI_DOUBLE,MPI_SUM,world);
+
+  // need to make non-neutral systems and/or
+  //  per-atom energy translationally invariant
+
+  if (eflag_atom || fabs(qsum) > SMALL) {
+
+    error->all(FLERR,"Cannot (yet) use kspace slab correction with "
+      "long-range spins and non-neutral systems or per-atom energy");
+  }
+
+  // compute corrections
+
+  const double e_slabcorr = MY_2PI*(spin_all*spin_all/12.0)/volume;
+  const double spscale = mub2mu0 * scale;
+
+  if (eflag_global) energy += spscale * e_slabcorr;
+
+  // per-atom energy
+
+  if (eflag_atom) {
+    double efact = spscale * MY_2PI/volume/12.0;
+    for (int i = 0; i < nlocal; i++) {
+      spz = sp[i][2]*sp[i][3];
+      eatom[i] += efact * spz * spin_all;
+    }
+  }
+
+  // add on mag. force corrections
+
+  double ffact = spscale * (-4.0*MY_PI/volume);
+  double **fm_long = atom->fm_long;
+  for (int i = 0; i < nlocal; i++) {
+    fm_long[i][2] += ffact * spin_all;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute musum,musqsum,mu2 for magnetic spins
+   called initially, when particle count changes, when spins are changed
+------------------------------------------------------------------------- */
+
+void EwaldDipoleSpin::spsum_musq()
+{
+  const int nlocal = atom->nlocal;
+
+  musum = musqsum = mu2 = 0.0;
+  if (atom->sp_flag) {
+    double** sp = atom->sp;
+    double spx,spy,spz;
+    double musum_local(0.0), musqsum_local(0.0);
+
+    for (int i = 0; i < nlocal; i++) {
+      spx = sp[i][0]*sp[i][3];
+      spy = sp[i][1]*sp[i][3];
+      spz = sp[i][2]*sp[i][3];
+      musum_local += spx + spy + spz;
+      musqsum_local += spx*spx + spy*spy + spz*spz;
+    }
+
+    MPI_Allreduce(&musum_local,&musum,1,MPI_DOUBLE,MPI_SUM,world);
+    MPI_Allreduce(&musqsum_local,&musqsum,1,MPI_DOUBLE,MPI_SUM,world);
+
+    //mu2 = musqsum * mub2mu0;
+    mu2 = musqsum;
+  }
+
+  if (mu2 == 0 && comm->me == 0)
+    error->all(FLERR,"Using kspace solver EwaldDipoleSpin on system with no spins");
+}
--- a/src/KSPACE/ewald_dipole_spin.h
+++ b/src/KSPACE/ewald_dipole_spin.h
@ -0,0 +1,102 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(ewald/dipole/spin,EwaldDipoleSpin)
+
+#else
+
+#ifndef LMP_EWALD_DIPOLE_SPIN_H
+#define LMP_EWALD_DIPOLE_SPIN_H
+
+#include "ewald_dipole.h"
+
+namespace LAMMPS_NS {
+
+class EwaldDipoleSpin : public EwaldDipole {
+ public:
+  EwaldDipoleSpin(class LAMMPS *);
+  virtual ~EwaldDipoleSpin();
+  void init();
+  void setup();
+  void compute(int, int);
+
+ protected:
+  double hbar;                  // reduced Planck's constant      
+  double mub;                   // Bohr's magneton                
+  double mu_0;                  // vacuum permeability
+  double mub2mu0;               // prefactor for mech force
+  double mub2mu0hbinv;          // prefactor for mag force
+
+  void spsum_musq(); 
+  virtual void eik_dot_r();
+  void slabcorr();
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Cannot use EwaldDipoleSpin with 2d simulation
+
+The kspace style ewald cannot be used in 2d simulations.  You can use
+2d EwaldDipoleSpin in a 3d simulation; see the kspace_modify command.
+
+E: Kspace style requires atom attribute q
+
+The atom style defined does not have these attributes.
+
+E: Cannot use nonperiodic boundaries with EwaldDipoleSpin
+
+For kspace style ewald, all 3 dimensions must have periodic boundaries
+unless you use the kspace_modify command to define a 2d slab with a
+non-periodic z dimension.
+
+E: Incorrect boundaries with slab EwaldDipoleSpin
+
+Must have periodic x,y dimensions and non-periodic z dimension to use
+2d slab option with EwaldDipoleSpin.
+
+E: Cannot (yet) use EwaldDipoleSpin with triclinic box and slab correction
+
+This feature is not yet supported.
+
+E: KSpace style is incompatible with Pair style
+
+Setting a kspace style requires that a pair style with matching
+long-range Coulombic or dispersion components be used.
+
+E: KSpace accuracy must be > 0
+
+The kspace accuracy designated in the input must be greater than zero.
+
+E: Must use 'kspace_modify gewald' for uncharged system
+
+UNDOCUMENTED
+
+E: Cannot (yet) use K-space slab correction with compute group/group for triclinic systems
+
+This option is not yet supported.
+
+*/
--- a/src/KSPACE/pppm.cpp
+++ b/src/KSPACE/pppm.cpp
@ -1430,12 +1430,13 @@ void PPPM::set_grid_local()
  double zprd = prd[2];
  double zprd_slab = zprd*slab_volfactor;

-  double dist[3];
+  double dist[3] = {0.0,0.0,0.0};
  double cuthalf = 0.5*neighbor->skin + qdist;
  if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf;
  else kspacebbox(cuthalf,&dist[0]);

  int nlo,nhi;
+  nlo = nhi = 0;

  nlo = static_cast<int> ((sublo[0]-dist[0]-boxlo[0]) *
                            nx_pppm/xprd + shift) - OFFSET;
--- a/src/KSPACE/pppm.h
+++ b/src/KSPACE/pppm.h
@ -42,7 +42,7 @@ class PPPM : public KSpace {
  virtual void settings(int, char **);
  virtual void init();
  virtual void setup();
-  void setup_grid();
+  virtual void setup_grid();
  virtual void compute(int, int);
  virtual int timing_1d(int, double &);
  virtual int timing_3d(int, double &);
@ -106,10 +106,10 @@ class PPPM : public KSpace {
  double qdist;                // distance from O site to negative charge
  double alpha;                // geometric factor

-  void set_grid_global();
+  virtual void set_grid_global();
  void set_grid_local();
  void adjust_gewald();
-  double newton_raphson_f();
+  virtual double newton_raphson_f();
  double derivf();
  double final_accuracy();

@ -146,7 +146,7 @@ class PPPM : public KSpace {
  void compute_drho1d(const FFT_SCALAR &, const FFT_SCALAR &,
                     const FFT_SCALAR &);
  void compute_rho_coeff();
-  void slabcorr();
+  virtual void slabcorr();

  // grid communication

--- a/src/KSPACE/pppm_dipole.cpp
+++ b/src/KSPACE/pppm_dipole.cpp
--- a/src/KSPACE/pppm_dipole.h
+++ b/src/KSPACE/pppm_dipole.h
@ -0,0 +1,217 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/dipole,PPPMDipole)
+
+#else
+
+#ifndef LMP_PPPM_DIPOLE_H
+#define LMP_PPPM_DIPOLE_H
+
+#include "pppm.h"
+
+namespace LAMMPS_NS {
+
+class PPPMDipole : public PPPM {
+ public:
+  PPPMDipole(class LAMMPS *);
+  virtual ~PPPMDipole();
+  void init();
+  void setup();
+  void setup_grid();
+  void compute(int, int);
+  int timing_1d(int, double &);
+  int timing_3d(int, double &);
+  double memory_usage();
+
+ protected:
+  void set_grid_global();
+  double newton_raphson_f(); 
+
+  void allocate();
+  void allocate_peratom();
+  void deallocate();
+  void deallocate_peratom();
+  void compute_gf_denom();
+
+  void slabcorr();
+
+  // grid communication
+
+  void pack_forward(int, FFT_SCALAR *, int, int *);
+  void unpack_forward(int, FFT_SCALAR *, int, int *);
+  void pack_reverse(int, FFT_SCALAR *, int, int *);
+  void unpack_reverse(int, FFT_SCALAR *, int, int *);
+
+  // dipole
+
+  FFT_SCALAR ***densityx_brick_dipole,***densityy_brick_dipole,***densityz_brick_dipole;
+  FFT_SCALAR ***vdxx_brick_dipole,***vdyy_brick_dipole,***vdzz_brick_dipole;
+  FFT_SCALAR ***vdxy_brick_dipole,***vdxz_brick_dipole,***vdyz_brick_dipole;
+  FFT_SCALAR ***ux_brick_dipole,***uy_brick_dipole,***uz_brick_dipole;
+  FFT_SCALAR ***v0x_brick_dipole,***v1x_brick_dipole,***v2x_brick_dipole;
+  FFT_SCALAR ***v3x_brick_dipole,***v4x_brick_dipole,***v5x_brick_dipole;
+  FFT_SCALAR ***v0y_brick_dipole,***v1y_brick_dipole,***v2y_brick_dipole;
+  FFT_SCALAR ***v3y_brick_dipole,***v4y_brick_dipole,***v5y_brick_dipole;
+  FFT_SCALAR ***v0z_brick_dipole,***v1z_brick_dipole,***v2z_brick_dipole;
+  FFT_SCALAR ***v3z_brick_dipole,***v4z_brick_dipole,***v5z_brick_dipole;
+  FFT_SCALAR *work3,*work4;
+  FFT_SCALAR *densityx_fft_dipole,*densityy_fft_dipole,*densityz_fft_dipole;
+  class GridComm *cg_dipole;
+  class GridComm *cg_peratom_dipole;
+  int only_dipole_flag;
+  double musum,musqsum,mu2;
+  double find_gewald_dipole(double, double, bigint, double, double);
+  double newton_raphson_f_dipole(double, double, bigint, double, double);
+  double derivf_dipole(double, double, bigint, double, double);
+  double compute_df_kspace_dipole();
+  double compute_qopt_dipole();
+  void compute_gf_dipole();
+  void make_rho_dipole();
+  void brick2fft_dipole();
+  void poisson_ik_dipole();
+  void poisson_peratom_dipole();
+  void fieldforce_ik_dipole();
+  void fieldforce_peratom_dipole();
+  double final_accuracy_dipole();
+  void musum_musq();
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Cannot (yet) use charges with Kspace style PPPMDipole
+
+Charge-dipole interactions are not yet implemented in PPPMDipole so this
+feature is not yet supported.
+
+E: Must redefine kspace_style after changing to triclinic box
+
+Self-explanatory.
+
+E: Kspace style requires atom attribute mu
+
+The atom style defined does not have this attribute.
+
+E: Cannot (yet) use kspace_modify diff ad with dipoles
+
+This feature is not yet supported.
+
+E: Cannot (yet) use 'electron' units with dipoles
+
+This feature is not yet supported.
+
+E: Cannot yet use triclinic cells with PPPMDipole
+
+This feature is not yet supported.
+
+E: Cannot yet use TIP4P with PPPMDipole
+
+This feature is not yet supported.
+
+E: Cannot use nonperiodic boundaries with PPPM
+
+For kspace style pppm, all 3 dimensions must have periodic boundaries
+unless you use the kspace_modify command to define a 2d slab with a
+non-periodic z dimension.
+
+E: Incorrect boundaries with slab PPPM
+
+Must have periodic x,y dimensions and non-periodic z dimension to use
+2d slab option with PPPM.
+
+E: PPPM order cannot be < 2 or > than %d
+
+This is a limitation of the PPPM implementation in LAMMPS.
+
+E: KSpace style is incompatible with Pair style
+
+Setting a kspace style requires that a pair style with matching
+long-range dipole components be used.
+
+W: Reducing PPPM order b/c stencil extends beyond nearest neighbor processor
+
+This may lead to a larger grid than desired. See the kspace_modify overlap
+command to prevent changing of the PPPM order.
+
+E: PPPM order < minimum allowed order
+
+The default minimum order is 2. This can be reset by the
+kspace_modify minorder command.
+
+E: PPPM grid stencil extends beyond nearest neighbor processor
+
+This is not allowed if the kspace_modify overlap setting is no.
+
+E: Cannot (yet) compute per-atom virial with kspace style pppm/dipole
+
+This feature is not yet supported.
+
+E: KSpace accuracy must be > 0
+
+The kspace accuracy designated in the input must be greater than zero.
+
+E: Could not compute grid size
+
+The code is unable to compute a grid size consistent with the desired
+accuracy. This error should not occur for typical problems. Please
+send an email to the developers.
+
+E: PPPM grid is too large
+
+The global PPPM grid is larger than OFFSET in one or more dimensions.
+OFFSET is currently set to 4096. You likely need to decrease the
+requested accuracy.
+
+E: Could not compute g_ewald
+
+The Newton-Raphson solver failed to converge to a good value for
+g_ewald. This error should not occur for typical problems. Please
+send an email to the developers.
+
+E: Non-numeric box dimensions - simulation unstable
+
+The box size has apparently blown up.
+
+E: Out of range atoms - cannot compute PPPM
+
+One or more atoms are attempting to map their charge to a PPPM grid
+point that is not owned by a processor. This is likely for one of two
+reasons, both of them bad. First, it may mean that an atom near the
+boundary of a processor's sub-domain has moved more than 1/2 the
+"neighbor skin distance"_neighbor.html without neighbor lists being
+rebuilt and atoms being migrated to new processors. This also means
+you may be missing pairwise interactions that need to be computed.
+The solution is to change the re-neighboring criteria via the
+"neigh_modify"_neigh_modify command. The safest settings are "delay 0
+every 1 check yes". Second, it may mean that an atom has moved far
+outside a processor's sub-domain or even the entire simulation box.
+This indicates bad physics, e.g. due to highly overlapping atoms, too
+large a timestep, etc.
+
+E: Using kspace solver PPPMDipole on system with no dipoles
+
+Must have non-zero dipoles with PPPMDipole.
+
+E: Must use kspace_modify gewald for system with no dipoles
+
+Self-explanatory.
+
+*/
--- a/src/KSPACE/pppm_dipole_spin.cpp
+++ b/src/KSPACE/pppm_dipole_spin.cpp
@ -0,0 +1,755 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Julien Tranchida (SNL)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include "pppm_dipole_spin.h"
+#include "atom.h"
+#include "comm.h"
+#include "gridcomm.h"
+#include "neighbor.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "domain.h"
+#include "fft3d_wrap.h"
+#include "remap_wrap.h"
+#include "memory.h"
+#include "error.h"
+#include "update.h"
+
+#include "math_const.h"
+#include "math_special.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecial;
+
+#define MAXORDER 7
+#define OFFSET 16384
+#define LARGE 10000.0
+#define SMALL 0.00001
+#define EPS_HOC 1.0e-7
+
+enum{REVERSE_MU};
+enum{FORWARD_MU,FORWARD_MU_PERATOM};
+
+#ifdef FFT_SINGLE
+#define ZEROF 0.0f
+#define ONEF  1.0f
+#else
+#define ZEROF 0.0
+#define ONEF  1.0
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+PPPMDipoleSpin::PPPMDipoleSpin(LAMMPS *lmp) : 
+  PPPMDipole(lmp)
+{
+  dipoleflag = 0;
+  spinflag = 1;
+  
+  hbar = force->hplanck/MY_2PI;         	// eV/(rad.THz)
+  mub = 9.274e-4;                     		// in A.Ang^2
+  mu_0 = 785.15;               			// in eV/Ang/A^2
+  mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);	// in eV.Ang^3
+  mub2mu0hbinv = mub2mu0 / hbar;        	// in rad.THz
+}
+
+/* ----------------------------------------------------------------------
+   free all memory
+------------------------------------------------------------------------- */
+
+PPPMDipoleSpin::~PPPMDipoleSpin()
+{
+  if (copymode) return;
+
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+  fft1 = NULL;
+  fft2 = NULL;
+  remap = NULL;
+  cg_dipole = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   called once before run
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::init()
+{
+  if (me == 0) {
+    if (screen) fprintf(screen,"PPPMDipoleSpin initialization ...\n");
+    if (logfile) fprintf(logfile,"PPPMDipoleSpin initialization ...\n");
+  }
+
+  // error check
+
+  spinflag = atom->sp?1:0;
+  
+  triclinic_check();
+
+  if (triclinic != domain->triclinic)
+    error->all(FLERR,"Must redefine kspace_style after changing to triclinic box");
+
+  if (domain->dimension == 2) error->all(FLERR,
+                                         "Cannot use PPPMDipoleSpin with 2d simulation");
+  if (comm->style != 0)
+    error->universe_all(FLERR,"PPPMDipoleSpin can only currently be used with "
+                        "comm_style brick");
+
+  if (!atom->sp) error->all(FLERR,"Kspace style requires atom attribute sp");
+
+  if (atom->sp && differentiation_flag == 1) error->all(FLERR,"Cannot (yet) use"
+     " kspace_modify diff ad with spins");
+
+  if (spinflag && strcmp(update->unit_style,"metal") != 0)
+    error->all(FLERR,"'metal' units have to be used with spins");
+
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all(FLERR,"Cannot use nonperiodic boundaries with PPPMDipoleSpin");
+  if (slabflag) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all(FLERR,"Incorrect boundaries with slab PPPMDipoleSpin");
+  }
+
+  if (order < 2 || order > MAXORDER) {
+    char str[128];
+    sprintf(str,"PPPMDipoleSpin order cannot be < 2 or > than %d",MAXORDER);
+    error->all(FLERR,str);
+  }
+
+  // extract short-range Coulombic cutoff from pair style
+
+  triclinic = domain->triclinic;
+  if (triclinic)
+    error->all(FLERR,"Cannot yet use triclinic cells with PPPMDipoleSpin");
+
+  pair_check();
+
+  int itmp = 0;
+  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
+  // check the correct extract here
+  if (p_cutoff == NULL)
+    error->all(FLERR,"KSpace style is incompatible with Pair style");
+  cutoff = *p_cutoff;
+
+  // kspace TIP4P not yet supported
+  // qdist = offset only for TIP4P fictitious charge
+  
+  qdist = 0.0; 
+  if (tip4pflag)
+    error->all(FLERR,"Cannot yet use TIP4P with PPPMDipoleSpin");
+
+  scale = 1.0;
+  spsum_spsq();
+  natoms_original = atom->natoms;
+
+  // set accuracy (force units) from accuracy_relative or accuracy_absolute
+
+  // is two_charge_force still relevant for spin systems?
+
+  if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
+  else accuracy = accuracy_relative * two_charge_force;
+
+  // free all arrays previously allocated
+
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+
+  // setup FFT grid resolution and g_ewald
+  // normally one iteration thru while loop is all that is required
+  // if grid stencil does not extend beyond neighbor proc
+  //   or overlap is allowed, then done
+  // else reduce order and try again
+
+  int (*procneigh)[2] = comm->procneigh;
+
+  GridComm *cgtmp = NULL;
+  int iteration = 0;
+    
+  while (order >= minorder) {
+    if (iteration && me == 0)
+      error->warning(FLERR,"Reducing PPPMDipoleSpin order b/c stencil extends "
+                     "beyond nearest neighbor processor");
+
+    compute_gf_denom();
+    set_grid_global();
+    set_grid_local();
+    if (overlap_allowed) break;
+
+    cgtmp = new GridComm(lmp,world,1,1,
+                         nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                         nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                         procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                         procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+    cgtmp->ghost_notify();
+    if (!cgtmp->ghost_overlap()) break;
+    delete cgtmp;
+
+    order--;
+    iteration++;
+  }
+
+  if (order < minorder) error->all(FLERR,"PPPMDipoleSpin order < minimum allowed order");
+  if (!overlap_allowed && cgtmp->ghost_overlap())
+    error->all(FLERR,"PPPMDipoleSpin grid stencil extends "
+               "beyond nearest neighbor processor");
+  if (cgtmp) delete cgtmp;
+
+  // adjust g_ewald
+
+  if (!gewaldflag) adjust_gewald();
+
+  // calculate the final accuracy
+
+  double estimated_accuracy = final_accuracy_dipole();
+
+  // print stats
+
+  int ngrid_max,nfft_both_max;
+  MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world);
+  MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world);
+
+  if (me == 0) {
+
+#ifdef FFT_SINGLE
+    const char fft_prec[] = "single";
+#else
+    const char fft_prec[] = "double";
+#endif
+
+    if (screen) {
+      fprintf(screen,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(screen,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      fprintf(screen,"  stencil order = %d\n",order);
+      fprintf(screen,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(screen,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(screen,"  using %s precision FFTs\n",fft_prec);
+      fprintf(screen,"  3d grid and FFT values/proc = %d %d\n",
+              ngrid_max,nfft_both_max);
+    }
+    if (logfile) {
+      fprintf(logfile,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(logfile,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      fprintf(logfile,"  stencil order = %d\n",order);
+      fprintf(logfile,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(logfile,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(logfile,"  using %s precision FFTs\n",fft_prec);
+      fprintf(logfile,"  3d grid and FFT values/proc = %d %d\n",
+              ngrid_max,nfft_both_max);
+    }
+  }
+
+  // allocate K-space dependent memory
+  // don't invoke allocate peratom(), will be allocated when needed
+
+  allocate();
+  cg_dipole->ghost_notify();
+  cg_dipole->setup();
+
+  // pre-compute Green's function denominator expansion
+  // pre-compute 1d charge distribution coefficients
+
+  compute_gf_denom();
+  compute_rho_coeff();
+}
+
+/* ----------------------------------------------------------------------
+   compute the PPPMDipoleSpin long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::compute(int eflag, int vflag)
+{
+  int i,j;
+
+  // set energy/virial flags
+  // invoke allocate_peratom() if needed for first time
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  if (vflag_atom)
+    error->all(FLERR,"Cannot (yet) compute per-atom virial "
+                       "with kspace style pppm/dipole/spin");
+
+  if (evflag_atom && !peratom_allocate_flag) {
+    allocate_peratom();
+    cg_peratom_dipole->ghost_notify();
+    cg_peratom_dipole->setup();
+  }
+
+  // if atom count has changed, update qsum and qsqsum
+
+  if (atom->natoms != natoms_original) {
+    spsum_spsq();
+    natoms_original = atom->natoms;
+  }
+
+  // return if there are no spins
+
+  if (musqsum == 0.0) return;
+
+  // convert atoms from box to lamda coords
+
+  boxlo = domain->boxlo;
+
+  // extend size of per-atom arrays if necessary
+
+  if (atom->nmax > nmax) {
+    memory->destroy(part2grid);
+    nmax = atom->nmax;
+    memory->create(part2grid,nmax,3,"pppm_spin:part2grid");
+  }
+
+  // find grid points for all my particles
+  // map my particle charge onto my local 3d on-grid density
+
+  particle_map();
+  make_rho_spin();
+
+  // all procs communicate density values from their ghost cells
+  //   to fully sum contribution in their 3d bricks
+  // remap from 3d decomposition to FFT decomposition
+
+  cg_dipole->reverse_comm(this,REVERSE_MU);
+  brick2fft_dipole();
+
+  // compute potential gradient on my FFT grid and
+  //   portion of e_long on this proc's FFT grid
+  // return gradients (electric fields) in 3d brick decomposition
+  // also performs per-atom calculations via poisson_peratom()
+
+  poisson_ik_dipole();
+
+  // all procs communicate E-field values
+  // to fill ghost cells surrounding their 3d bricks
+
+  cg_dipole->forward_comm(this,FORWARD_MU);
+
+  // extra per-atom energy/virial communication
+
+  if (evflag_atom) {
+    cg_peratom_dipole->forward_comm(this,FORWARD_MU_PERATOM);
+  }
+
+  // calculate the force on my particles
+
+  fieldforce_ik_spin();
+
+  // extra per-atom energy/virial communication
+
+  if (evflag_atom) fieldforce_peratom_spin();
+
+  // sum global energy across procs and add in volume-dependent term
+
+  const double spscale = mub2mu0 * scale;
+  const double g3 = g_ewald*g_ewald*g_ewald;
+
+  if (eflag_global) {
+    double energy_all;
+    MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
+    energy = energy_all;
+
+    energy *= 0.5*volume;
+    energy -= musqsum*2.0*g3/3.0/MY_PIS;
+    energy *= spscale;
+  }
+
+  // sum global virial across procs
+
+  if (vflag_global) {
+    double virial_all[6];
+    MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
+    for (i = 0; i < 6; i++) virial[i] = 0.5*spscale*volume*virial_all[i];
+  }
+
+  // per-atom energy/virial
+  // energy includes self-energy correction
+
+  if (evflag_atom) {
+    double **sp = atom->sp;
+    double spx,spy,spz;
+    int nlocal = atom->nlocal;
+    int ntotal = nlocal;
+
+    if (eflag_atom) {
+      for (i = 0; i < nlocal; i++) {
+	spx = sp[i][0]*sp[i][3];
+	spy = sp[i][1]*sp[i][3];
+	spz = sp[i][2]*sp[i][3];
+        eatom[i] *= 0.5;
+        eatom[i] -= (spx*spx + spy*spy + spz*spz)*2.0*g3/3.0/MY_PIS;
+        eatom[i] *= spscale;
+      }
+    }
+
+    if (vflag_atom) {
+      for (i = 0; i < ntotal; i++)
+        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*spscale;
+    }
+  }
+
+  // 2d slab correction
+
+  if (slabflag == 1) slabcorr();
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = charge "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::make_rho_spin()
+{
+  int l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz;
+  FFT_SCALAR x0,y0,z0;
+  FFT_SCALAR x1,y1,z1;
+  FFT_SCALAR x2,y2,z2;
+
+  // clear 3d density array
+
+  memset(&(densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+  memset(&(densityy_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+  memset(&(densityz_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  double **sp = atom->sp;
+  double spx,spy,spz;
+  double **x = atom->x;
+  int nlocal = atom->nlocal;
+
+  for (int i = 0; i < nlocal; i++) {
+
+    nx = part2grid[i][0];
+    ny = part2grid[i][1];
+    nz = part2grid[i][2];
+    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(dx,dy,dz);
+
+    spx = sp[i][0]*sp[i][3];
+    spy = sp[i][1]*sp[i][3];
+    spz = sp[i][2]*sp[i][3];
+    z0 = delvolinv * spx;
+    z1 = delvolinv * spy;
+    z2 = delvolinv * spz;
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      y0 = z0*rho1d[2][n];
+      y1 = z1*rho1d[2][n];
+      y2 = z2*rho1d[2][n];
+      for (m = nlower; m <= nupper; m++) {
+        my = m+ny;
+        x0 = y0*rho1d[1][m];
+        x1 = y1*rho1d[1][m];
+        x2 = y2*rho1d[1][m];
+        for (l = nlower; l <= nupper; l++) {
+          mx = l+nx;
+          densityx_brick_dipole[mz][my][mx] += x0*rho1d[0][l];
+          densityy_brick_dipole[mz][my][mx] += x1*rho1d[0][l];
+          densityz_brick_dipole[mz][my][mx] += x2*rho1d[0][l];
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get magnetic field & force on my particles for ik
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::fieldforce_ik_spin()
+{
+  int i,l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz;
+  FFT_SCALAR x0,y0,z0;
+  FFT_SCALAR ex,ey,ez;
+  FFT_SCALAR vxx,vyy,vzz,vxy,vxz,vyz;
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  double **sp = atom->sp;
+  double spx,spy,spz;
+  double **x = atom->x;
+  double **f = atom->f;
+  double **fm_long = atom->fm_long;
+
+  int nlocal = atom->nlocal;
+
+  for (i = 0; i < nlocal; i++) {
+    nx = part2grid[i][0];
+    ny = part2grid[i][1];
+    nz = part2grid[i][2];
+    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(dx,dy,dz);
+
+    ex = ey = ez = ZEROF;
+    vxx = vyy = vzz = vxy = vxz = vyz = ZEROF;
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      z0 = rho1d[2][n];
+      for (m = nlower; m <= nupper; m++) {
+        my = m+ny;
+        y0 = z0*rho1d[1][m];
+        for (l = nlower; l <= nupper; l++) {
+          mx = l+nx;
+          x0 = y0*rho1d[0][l];
+          ex -= x0*ux_brick_dipole[mz][my][mx];
+          ey -= x0*uy_brick_dipole[mz][my][mx];
+          ez -= x0*uz_brick_dipole[mz][my][mx];
+          vxx -= x0*vdxx_brick_dipole[mz][my][mx];
+          vyy -= x0*vdyy_brick_dipole[mz][my][mx];
+          vzz -= x0*vdzz_brick_dipole[mz][my][mx];
+          vxy -= x0*vdxy_brick_dipole[mz][my][mx];
+          vxz -= x0*vdxz_brick_dipole[mz][my][mx];
+          vyz -= x0*vdyz_brick_dipole[mz][my][mx];
+        }
+      }
+    }
+
+    // convert M-field and store mech. forces
+
+    const double spfactor = mub2mu0 * scale;
+    spx = sp[i][0]*sp[i][3];
+    spy = sp[i][1]*sp[i][3];
+    spz = sp[i][2]*sp[i][3];
+    f[i][0] += spfactor*(vxx*spx + vxy*spy + vxz*spz);
+    f[i][1] += spfactor*(vxy*spx + vyy*spy + vyz*spz);
+    f[i][2] += spfactor*(vxz*spx + vyz*spy + vzz*spz);
+
+    // store long-range mag. precessions
+    
+    const double spfactorh = mub2mu0hbinv * scale;
+    fm_long[i][0] += spfactorh*ex;
+    fm_long[i][1] += spfactorh*ey;
+    fm_long[i][2] += spfactorh*ez;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::fieldforce_peratom_spin()
+{
+  int i,l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz,x0,y0,z0;
+  FFT_SCALAR ux,uy,uz;
+  FFT_SCALAR v0x,v1x,v2x,v3x,v4x,v5x;
+  FFT_SCALAR v0y,v1y,v2y,v3y,v4y,v5y;
+  FFT_SCALAR v0z,v1z,v2z,v3z,v4z,v5z;
+
+  // loop over my charges, interpolate from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  double **sp = atom->sp;
+  double spx,spy,spz;
+  double **x = atom->x;
+
+  int nlocal = atom->nlocal;
+
+  for (i = 0; i < nlocal; i++) {
+    nx = part2grid[i][0];
+    ny = part2grid[i][1];
+    nz = part2grid[i][2];
+    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(dx,dy,dz);
+
+    ux = uy = uz = ZEROF;
+    v0x = v1x = v2x = v3x = v4x = v5x = ZEROF;
+    v0y = v1y = v2y = v3y = v4y = v5y = ZEROF;
+    v0z = v1z = v2z = v3z = v4z = v5z = ZEROF;
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      z0 = rho1d[2][n];
+      for (m = nlower; m <= nupper; m++) {
+        my = m+ny;
+        y0 = z0*rho1d[1][m];
+        for (l = nlower; l <= nupper; l++) {
+          mx = l+nx;
+          x0 = y0*rho1d[0][l];
+          if (eflag_atom) {
+            ux += x0*ux_brick_dipole[mz][my][mx];
+            uy += x0*uy_brick_dipole[mz][my][mx];
+            uz += x0*uz_brick_dipole[mz][my][mx];
+          }
+          if (vflag_atom) {
+            v0x += x0*v0x_brick_dipole[mz][my][mx];
+            v1x += x0*v1x_brick_dipole[mz][my][mx];
+            v2x += x0*v2x_brick_dipole[mz][my][mx];
+            v3x += x0*v3x_brick_dipole[mz][my][mx];
+            v4x += x0*v4x_brick_dipole[mz][my][mx];
+            v5x += x0*v5x_brick_dipole[mz][my][mx];
+            v0y += x0*v0y_brick_dipole[mz][my][mx];
+            v1y += x0*v1y_brick_dipole[mz][my][mx];
+            v2y += x0*v2y_brick_dipole[mz][my][mx];
+            v3y += x0*v3y_brick_dipole[mz][my][mx];
+            v4y += x0*v4y_brick_dipole[mz][my][mx];
+            v5y += x0*v5y_brick_dipole[mz][my][mx];
+            v0z += x0*v0z_brick_dipole[mz][my][mx];
+            v1z += x0*v1z_brick_dipole[mz][my][mx];
+            v2z += x0*v2z_brick_dipole[mz][my][mx];
+            v3z += x0*v3z_brick_dipole[mz][my][mx];
+            v4z += x0*v4z_brick_dipole[mz][my][mx];
+            v5z += x0*v5z_brick_dipole[mz][my][mx];
+          }
+        }
+      }
+    }
+
+    spx = sp[i][0]*sp[i][3];
+    spy = sp[i][1]*sp[i][3];
+    spz = sp[i][2]*sp[i][3];
+    if (eflag_atom) eatom[i] += spx*ux + spy*uy + spz*uz;
+    if (vflag_atom) {
+      vatom[i][0] += spx*v0x + spy*v0y + spz*v0z;
+      vatom[i][1] += spx*v1x + spy*v1y + spz*v1z;
+      vatom[i][2] += spx*v2x + spy*v2y + spz*v2z;
+      vatom[i][3] += spx*v3x + spy*v3y + spz*v3z;
+      vatom[i][4] += spx*v4x + spy*v4y + spz*v4z;
+      vatom[i][5] += spx*v5x + spy*v5y + spz*v5z;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Slab-geometry correction term to dampen inter-slab interactions between
+   periodically repeating slabs.  Yields good approximation to 2D Ewald if
+   adequate empty space is left between repeating slabs (J. Chem. Phys.
+   111, 3155).  Slabs defined here to be parallel to the xy plane. Also
+   extended to non-neutral systems (J. Chem. Phys. 131, 094107).
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::slabcorr()
+{
+  // compute local contribution to global spin moment
+
+  double **x = atom->x;
+  double zprd = domain->zprd;
+  int nlocal = atom->nlocal;
+
+  double spin = 0.0;
+  double **sp = atom->sp;
+  double spx,spy,spz;
+  for (int i = 0; i < nlocal; i++) { 
+    spz = sp[i][2]*sp[i][3];
+    spin += spz;
+  }
+
+  // sum local contributions to get global spin moment
+
+  double spin_all;
+  MPI_Allreduce(&spin,&spin_all,1,MPI_DOUBLE,MPI_SUM,world);
+
+  // compute corrections
+
+  const double e_slabcorr = MY_2PI*(spin_all*spin_all/12.0)/volume;
+  const double spscale = mub2mu0 * scale;
+
+  if (eflag_global) energy += spscale * e_slabcorr;
+
+  // per-atom energy
+
+  if (eflag_atom) {
+    double efact = spscale * MY_2PI/volume/12.0;
+    for (int i = 0; i < nlocal; i++) {
+      spz = sp[i][2]*sp[i][3];
+      eatom[i] += efact * spz * spin_all;
+    }
+  }
+
+  // add on mag. force corrections
+
+  double ffact = spscale * (-4.0*MY_PI/volume);
+  double **fm_long = atom->fm_long;
+  for (int i = 0; i < nlocal; i++) {
+    fm_long[i][2] += ffact * spin_all;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute spsum,spsqsum,sp2
+   called initially, when particle count changes, when spins are changed
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::spsum_spsq()
+{
+  const int nlocal = atom->nlocal;
+
+  musum = musqsum = mu2 = 0.0;
+  if (atom->sp_flag) {
+    double **sp = atom->sp;
+    double spx, spy, spz;
+    double spsum_local(0.0), spsqsum_local(0.0);
+
+    // sum (direction x norm) of all spins
+
+    for (int i = 0; i < nlocal; i++) {
+      spx = sp[i][0]*sp[i][3];
+      spy = sp[i][1]*sp[i][3];
+      spz = sp[i][2]*sp[i][3];
+      spsum_local += spx + spy + spz;
+      spsqsum_local += spx*spx + spy*spy + spz*spz;
+    }
+
+    // store results into pppm_dipole quantities 
+
+    MPI_Allreduce(&spsum_local,&musum,1,MPI_DOUBLE,MPI_SUM,world);
+    MPI_Allreduce(&spsqsum_local,&musqsum,1,MPI_DOUBLE,MPI_SUM,world);
+
+    //mu2 = musqsum * mub2mu0;
+    mu2 = musqsum;
+  }
+
+  if (mu2 == 0 && comm->me == 0)
+    error->all(FLERR,"Using kspace solver PPPMDipoleSpin on system with no spins");
+}
--- a/src/KSPACE/pppm_dipole_spin.h
+++ b/src/KSPACE/pppm_dipole_spin.h
@ -0,0 +1,176 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/dipole/spin,PPPMDipoleSpin)
+
+#else
+
+#ifndef LMP_PPPM_DIPOLE_SPIN_H
+#define LMP_PPPM_DIPOLE_SPIN_H
+
+#include "pppm_dipole.h"
+
+namespace LAMMPS_NS {
+
+class PPPMDipoleSpin : public PPPMDipole {
+ public:
+  PPPMDipoleSpin(class LAMMPS *);
+  virtual ~PPPMDipoleSpin();
+  void init();
+  void compute(int, int);
+
+ protected:
+  double hbar;                  // reduced Planck's constant      
+  double mub;                   // Bohr's magneton                
+  double mu_0;                  // vacuum permeability
+  double mub2mu0;               // prefactor for mech force
+  double mub2mu0hbinv;          // prefactor for mag force
+
+  void slabcorr();
+
+  // spin
+
+  void make_rho_spin();
+  void fieldforce_ik_spin();
+  void fieldforce_peratom_spin();
+  void spsum_spsq();
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Cannot (yet) use charges with Kspace style PPPMDipoleSpin
+
+Charge-spin interactions are not yet implemented in PPPMDipoleSpin so this
+feature is not yet supported.
+
+E: Must redefine kspace_style after changing to triclinic box
+
+Self-explanatory.
+
+E: Kspace style requires atom attribute mu
+
+The atom style defined does not have this attribute.
+
+E: Cannot (yet) use kspace_modify diff ad with spins
+
+This feature is not yet supported.
+
+E: Cannot (yet) use 'electron' units with spins
+
+This feature is not yet supported.
+
+E: Cannot yet use triclinic cells with PPPMDipoleSpin
+
+This feature is not yet supported.
+
+E: Cannot yet use TIP4P with PPPMDipoleSpin
+
+This feature is not yet supported.
+
+E: Cannot use nonperiodic boundaries with PPPM
+
+For kspace style pppm, all 3 dimensions must have periodic boundaries
+unless you use the kspace_modify command to define a 2d slab with a
+non-periodic z dimension.
+
+E: Incorrect boundaries with slab PPPM
+
+Must have periodic x,y dimensions and non-periodic z dimension to use
+2d slab option with PPPM.
+
+E: PPPM order cannot be < 2 or > than %d
+
+This is a limitation of the PPPM implementation in LAMMPS.
+
+E: KSpace style is incompatible with Pair style
+
+Setting a kspace style requires that a pair style with matching
+long-range spin components be used.
+
+W: Reducing PPPM order b/c stencil extends beyond nearest neighbor processor
+
+This may lead to a larger grid than desired. See the kspace_modify overlap
+command to prevent changing of the PPPM order.
+
+E: PPPM order < minimum allowed order
+
+The default minimum order is 2. This can be reset by the
+kspace_modify minorder command.
+
+E: PPPM grid stencil extends beyond nearest neighbor processor
+
+This is not allowed if the kspace_modify overlap setting is no.
+
+E: Cannot (yet) compute per-atom virial with kspace style pppm/dipole/spin
+
+This feature is not yet supported.
+
+E: KSpace accuracy must be > 0
+
+The kspace accuracy designated in the input must be greater than zero.
+
+E: Could not compute grid size
+
+The code is unable to compute a grid size consistent with the desired
+accuracy. This error should not occur for typical problems. Please
+send an email to the developers.
+
+E: PPPM grid is too large
+
+The global PPPM grid is larger than OFFSET in one or more dimensions.
+OFFSET is currently set to 4096. You likely need to decrease the
+requested accuracy.
+
+E: Could not compute g_ewald
+
+The Newton-Raphson solver failed to converge to a good value for
+g_ewald. This error should not occur for typical problems. Please
+send an email to the developers.
+
+E: Non-numeric box dimensions - simulation unstable
+
+The box size has apparently blown up.
+
+E: Out of range atoms - cannot compute PPPM
+
+One or more atoms are attempting to map their charge to a PPPM grid
+point that is not owned by a processor. This is likely for one of two
+reasons, both of them bad. First, it may mean that an atom near the
+boundary of a processor's sub-domain has moved more than 1/2 the
+"neighbor skin distance"_neighbor.html without neighbor lists being
+rebuilt and atoms being migrated to new processors. This also means
+you may be missing pairwise interactions that need to be computed.
+The solution is to change the re-neighboring criteria via the
+"neigh_modify"_neigh_modify command. The safest settings are "delay 0
+every 1 check yes". Second, it may mean that an atom has moved far
+outside a processor's sub-domain or even the entire simulation box.
+This indicates bad physics, e.g. due to highly overlapping atoms, too
+large a timestep, etc.
+
+E: Using kspace solver PPPMDipoleSpin on system with no spins
+
+Must have non-zero spins with PPPMDipoleSpin.
+
+E: Must use kspace_modify gewald for system with no spins
+
+Self-explanatory.
+
+*/
--- a/src/MOLECULE/bond_table.cpp
+++ b/src/MOLECULE/bond_table.cpp
@ -503,7 +503,7 @@ void BondTable::param_extract(Table *tb, char *line)
 void BondTable::bcast_table(Table *tb)
 {
  MPI_Bcast(&tb->ninput,1,MPI_INT,0,world);
-  MPI_Bcast(&tb->r0,1,MPI_INT,0,world);
+  MPI_Bcast(&tb->r0,1,MPI_DOUBLE,0,world);

  int me;
  MPI_Comm_rank(world,&me);
@ -522,7 +522,6 @@ void BondTable::bcast_table(Table *tb)
    MPI_Bcast(&tb->fplo,1,MPI_DOUBLE,0,world);
    MPI_Bcast(&tb->fphi,1,MPI_DOUBLE,0,world);
  }
-  MPI_Bcast(&tb->r0,1,MPI_INT,0,world);
 }

 /* ----------------------------------------------------------------------
--- a/src/SNAP/compute_sna_atom.cpp
+++ b/src/SNAP/compute_sna_atom.cpp
@ -25,7 +25,6 @@
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
-#include "openmp_snap.h"

 using namespace LAMMPS_NS;

@ -45,7 +44,6 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :

  // default values

-  diagonalstyle = 0;
  rmin0 = 0.0;
  switchflag = 1;
  bzeroflag = 1;
@ -85,14 +83,7 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
  int iarg = nargmin;

  while (iarg < narg) {
-    if (strcmp(arg[iarg],"diagonal") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      diagonalstyle = atoi(arg[iarg+1]);
-      if (diagonalstyle < 0 || diagonalstyle > 3)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"rmin0") == 0) {
+    if (strcmp(arg[iarg],"rmin0") == 0) {
      if (iarg+2 > narg)
        error->all(FLERR,"Illegal compute sna/atom command");
      rmin0 = atof(arg[iarg+1]);
@ -115,28 +106,16 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
    } else error->all(FLERR,"Illegal compute sna/atom command");
  }

-  nthreads = comm->nthreads;
-  snaptr = new SNA*[nthreads];
-#if defined(_OPENMP)
-#pragma omp parallel default(none) shared(lmp,rfac0,twojmax,rmin0,switchflag,bzeroflag)
-#endif
-  {
-    int tid = omp_get_thread_num();
+  snaptr = new SNA(lmp,rfac0,twojmax,
+                   rmin0,switchflag,bzeroflag);

-    // always unset use_shared_arrays since it does not work with computes
-    snaptr[tid] = new SNA(lmp,rfac0,twojmax,diagonalstyle,
-                          0 /*use_shared_arrays*/, rmin0,switchflag,bzeroflag);
-  }
-
-  ncoeff = snaptr[0]->ncoeff;
+  ncoeff = snaptr->ncoeff;
  size_peratom_cols = ncoeff;
  if (quadraticflag) size_peratom_cols += (ncoeff*(ncoeff+1))/2;
  peratom_flag = 1;

  nmax = 0;
-  njmax = 0;
  sna = NULL;
-
 }

 /* ---------------------------------------------------------------------- */
@ -147,9 +126,7 @@ ComputeSNAAtom::~ComputeSNAAtom()
  memory->destroy(radelem);
  memory->destroy(wjelem);
  memory->destroy(cutsq);
-  for (int tid = 0; tid<nthreads; tid++)
-    delete snaptr[tid];
-  delete [] snaptr;
+  delete snaptr;
 }

 /* ---------------------------------------------------------------------- */
@ -176,13 +153,7 @@ void ComputeSNAAtom::init()
    if (strcmp(modify->compute[i]->style,"sna/atom") == 0) count++;
  if (count > 1 && comm->me == 0)
    error->warning(FLERR,"More than one compute sna/atom");
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    snaptr[tid]->init();
-  }
+  snaptr->init();
 }

 /* ---------------------------------------------------------------------- */
@ -223,11 +194,7 @@ void ComputeSNAAtom::compute_peratom()
  double** const x = atom->x;
  const int* const mask = atom->mask;

-#if defined(_OPENMP)
-#pragma omp parallel for default(none)
-#endif
  for (int ii = 0; ii < inum; ii++) {
-    const int tid = omp_get_thread_num();
    const int i = ilist[ii];
    if (mask[i] & groupbit) {

@ -241,7 +208,7 @@ void ComputeSNAAtom::compute_peratom()

      // insure rij, inside, and typej  are of size jnum

-      snaptr[tid]->grow_rij(jnum);
+      snaptr->grow_rij(jnum);

      // rij[][3] = displacements between atom I and those neighbors
      // inside = indices of neighbors of I within cutoff
@ -258,26 +225,25 @@ void ComputeSNAAtom::compute_peratom()
        const double rsq = delx*delx + dely*dely + delz*delz;
        int jtype = type[j];
        if (rsq < cutsq[itype][jtype] && rsq>1e-20) {
-          snaptr[tid]->rij[ninside][0] = delx;
-          snaptr[tid]->rij[ninside][1] = dely;
-          snaptr[tid]->rij[ninside][2] = delz;
-          snaptr[tid]->inside[ninside] = j;
-          snaptr[tid]->wj[ninside] = wjelem[jtype];
-          snaptr[tid]->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
+          snaptr->rij[ninside][0] = delx;
+          snaptr->rij[ninside][1] = dely;
+          snaptr->rij[ninside][2] = delz;
+          snaptr->inside[ninside] = j;
+          snaptr->wj[ninside] = wjelem[jtype];
+          snaptr->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
          ninside++;
        }
      }

-      snaptr[tid]->compute_ui(ninside);
-      snaptr[tid]->compute_zi();
-      snaptr[tid]->compute_bi();
-      snaptr[tid]->copy_bi2bvec();
+      snaptr->compute_ui(ninside);
+      snaptr->compute_zi();
+      snaptr->compute_bi();
      for (int icoeff = 0; icoeff < ncoeff; icoeff++)
-        sna[i][icoeff] = snaptr[tid]->bvec[icoeff];
+        sna[i][icoeff] = snaptr->blist[icoeff];
      if (quadraticflag) {
        int ncount = ncoeff;
        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          double bi = snaptr[tid]->bvec[icoeff];
+          double bi = snaptr->blist[icoeff];

          // diagonal element of quadratic matrix

@ -286,7 +252,7 @@ void ComputeSNAAtom::compute_peratom()
          // upper-triangular elements of quadratic matrix

          for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++)
-            sna[i][ncount++] = bi*snaptr[tid]->bvec[jcoeff];
+            sna[i][ncount++] = bi*snaptr->blist[jcoeff];
        }
      }
    } else {
@ -302,10 +268,9 @@ void ComputeSNAAtom::compute_peratom()

 double ComputeSNAAtom::memory_usage()
 {
-  double bytes = nmax*size_peratom_cols * sizeof(double);
-  bytes += 3*njmax*sizeof(double);
-  bytes += njmax*sizeof(int);
-  bytes += snaptr[0]->memory_usage()*comm->nthreads;
+  double bytes = nmax*size_peratom_cols * sizeof(double); // sna
+  bytes += snaptr->memory_usage();                        // SNA object
+
  return bytes;
 }

--- a/src/SNAP/compute_sna_atom.h
+++ b/src/SNAP/compute_sna_atom.h
@ -34,7 +34,7 @@ class ComputeSNAAtom : public Compute {
  double memory_usage();

 private:
-  int nmax, njmax, diagonalstyle;
+  int nmax;
  int ncoeff;
  double **cutsq;
  class NeighList *list;
@ -42,10 +42,9 @@ class ComputeSNAAtom : public Compute {
  double rcutfac;
  double *radelem;
  double *wjelem;
-  class SNA** snaptr;
+  class SNA* snaptr;
  double cutmax;
  int quadraticflag;
-  int nthreads;
 };

 }
--- a/src/SNAP/compute_snad_atom.cpp
+++ b/src/SNAP/compute_snad_atom.cpp
@ -25,7 +25,6 @@
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
-#include "openmp_snap.h"

 using namespace LAMMPS_NS;

@ -45,7 +44,6 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :

  // default values

-  diagonalstyle = 0;
  rmin0 = 0.0;
  switchflag = 1;
  bzeroflag = 1;
@ -83,14 +81,7 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
  int iarg = nargmin;

  while (iarg < narg) {
-    if (strcmp(arg[iarg],"diagonal") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      diagonalstyle = atof(arg[iarg+1]);
-      if (diagonalstyle < 0 || diagonalstyle > 3)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"rmin0") == 0) {
+    if (strcmp(arg[iarg],"rmin0") == 0) {
      if (iarg+2 > narg)
        error->all(FLERR,"Illegal compute snad/atom command");
      rmin0 = atof(arg[iarg+1]);
@ -113,20 +104,10 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
    } else error->all(FLERR,"Illegal compute snad/atom command");
  }

-  nthreads = comm->nthreads;
-  snaptr = new SNA*[nthreads];
-#if defined(_OPENMP)
-#pragma omp parallel default(none) shared(lmp,rfac0,twojmax,rmin0,switchflag,bzeroflag)
-#endif
-  {
-    int tid = omp_get_thread_num();
+  snaptr = new SNA(lmp,rfac0,twojmax,
+                   rmin0,switchflag,bzeroflag);

-    // always unset use_shared_arrays since it does not work with computes
-    snaptr[tid] = new SNA(lmp,rfac0,twojmax,diagonalstyle,
-                          0 /*use_shared_arrays*/, rmin0,switchflag,bzeroflag);
-  }
-
-  ncoeff = snaptr[0]->ncoeff;
+  ncoeff = snaptr->ncoeff;
  nperdim = ncoeff;
  if (quadraticflag) nperdim += (ncoeff*(ncoeff+1))/2;
  yoffset = nperdim;
@ -136,9 +117,7 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
  peratom_flag = 1;

  nmax = 0;
-  njmax = 0;
  snad = NULL;
-
 }

 /* ---------------------------------------------------------------------- */
@ -149,9 +128,7 @@ ComputeSNADAtom::~ComputeSNADAtom()
  memory->destroy(radelem);
  memory->destroy(wjelem);
  memory->destroy(cutsq);
-  for (int tid = 0; tid<nthreads; tid++)
-    delete snaptr[tid];
-  delete [] snaptr;
+  delete snaptr;
 }

 /* ---------------------------------------------------------------------- */
@ -178,13 +155,7 @@ void ComputeSNADAtom::init()
    if (strcmp(modify->compute[i]->style,"snad/atom") == 0) count++;
  if (count > 1 && comm->me == 0)
    error->warning(FLERR,"More than one compute snad/atom");
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    snaptr[tid]->init();
-  }
+  snaptr->init();
 }

 /* ---------------------------------------------------------------------- */
@ -235,11 +206,7 @@ void ComputeSNADAtom::compute_peratom()
  double** const x = atom->x;
  const int* const mask = atom->mask;

-#if defined(_OPENMP)
-#pragma omp parallel for default(none)
-#endif
  for (int ii = 0; ii < inum; ii++) {
-    const int tid = omp_get_thread_num();
    const int i = ilist[ii];
    if (mask[i] & groupbit) {

@ -258,7 +225,7 @@ void ComputeSNADAtom::compute_peratom()

      // insure rij, inside, and typej  are of size jnum

-      snaptr[tid]->grow_rij(jnum);
+      snaptr->grow_rij(jnum);

      // rij[][3] = displacements between atom I and those neighbors
      // inside = indices of neighbors of I within cutoff
@ -276,30 +243,28 @@ void ComputeSNADAtom::compute_peratom()
        const double rsq = delx*delx + dely*dely + delz*delz;
        int jtype = type[j];
        if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-          snaptr[tid]->rij[ninside][0] = delx;
-          snaptr[tid]->rij[ninside][1] = dely;
-          snaptr[tid]->rij[ninside][2] = delz;
-          snaptr[tid]->inside[ninside] = j;
-          snaptr[tid]->wj[ninside] = wjelem[jtype];
-          snaptr[tid]->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
+          snaptr->rij[ninside][0] = delx;
+          snaptr->rij[ninside][1] = dely;
+          snaptr->rij[ninside][2] = delz;
+          snaptr->inside[ninside] = j;
+          snaptr->wj[ninside] = wjelem[jtype];
+          snaptr->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
          ninside++;
        }
      }

-      snaptr[tid]->compute_ui(ninside);
-      snaptr[tid]->compute_zi();
+      snaptr->compute_ui(ninside);
+      snaptr->compute_zi();
      if (quadraticflag) {
-        snaptr[tid]->compute_bi();
-        snaptr[tid]->copy_bi2bvec();
+        snaptr->compute_bi();
      }

      for (int jj = 0; jj < ninside; jj++) {
-        const int j = snaptr[tid]->inside[jj];
-        snaptr[tid]->compute_duidrj(snaptr[tid]->rij[jj],
-                                    snaptr[tid]->wj[jj],
-                                    snaptr[tid]->rcutij[jj]);
-        snaptr[tid]->compute_dbidrj();
-        snaptr[tid]->copy_dbi2dbvec();
+        const int j = snaptr->inside[jj];
+        snaptr->compute_duidrj(snaptr->rij[jj],
+                                    snaptr->wj[jj],
+                                    snaptr->rcutij[jj]);
+        snaptr->compute_dbidrj();

        // Accumulate -dBi/dRi, -dBi/dRj

@ -307,12 +272,12 @@ void ComputeSNADAtom::compute_peratom()
        double *snadj = snad[j]+typeoffset;

        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          snadi[icoeff] += snaptr[tid]->dbvec[icoeff][0];
-          snadi[icoeff+yoffset] += snaptr[tid]->dbvec[icoeff][1];
-          snadi[icoeff+zoffset] += snaptr[tid]->dbvec[icoeff][2];
-          snadj[icoeff] -= snaptr[tid]->dbvec[icoeff][0];
-          snadj[icoeff+yoffset] -= snaptr[tid]->dbvec[icoeff][1];
-          snadj[icoeff+zoffset] -= snaptr[tid]->dbvec[icoeff][2];
+          snadi[icoeff] += snaptr->dblist[icoeff][0];
+          snadi[icoeff+yoffset] += snaptr->dblist[icoeff][1];
+          snadi[icoeff+zoffset] += snaptr->dblist[icoeff][2];
+          snadj[icoeff] -= snaptr->dblist[icoeff][0];
+          snadj[icoeff+yoffset] -= snaptr->dblist[icoeff][1];
+          snadj[icoeff+zoffset] -= snaptr->dblist[icoeff][2];
        }

        if (quadraticflag) {
@ -321,10 +286,10 @@ void ComputeSNADAtom::compute_peratom()
          snadj += quadraticoffset;
          int ncount = 0;
          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bi = snaptr[tid]->bvec[icoeff];
-            double bix = snaptr[tid]->dbvec[icoeff][0];
-            double biy = snaptr[tid]->dbvec[icoeff][1];
-            double biz = snaptr[tid]->dbvec[icoeff][2];
+            double bi = snaptr->blist[icoeff];
+            double bix = snaptr->dblist[icoeff][0];
+            double biy = snaptr->dblist[icoeff][1];
+            double biz = snaptr->dblist[icoeff][2];

            // diagonal elements of quadratic matrix

@ -343,12 +308,12 @@ void ComputeSNADAtom::compute_peratom()
            // upper-triangular elements of quadratic matrix

            for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              double dbxtmp = bi*snaptr[tid]->dbvec[jcoeff][0]
-                + bix*snaptr[tid]->bvec[jcoeff];
-              double dbytmp = bi*snaptr[tid]->dbvec[jcoeff][1]
-                + biy*snaptr[tid]->bvec[jcoeff];
-              double dbztmp = bi*snaptr[tid]->dbvec[jcoeff][2]
-                + biz*snaptr[tid]->bvec[jcoeff];
+              double dbxtmp = bi*snaptr->dblist[jcoeff][0]
+                + bix*snaptr->blist[jcoeff];
+              double dbytmp = bi*snaptr->dblist[jcoeff][1]
+                + biy*snaptr->blist[jcoeff];
+              double dbztmp = bi*snaptr->dblist[jcoeff][2]
+                + biz*snaptr->blist[jcoeff];

              snadi[ncount] +=         dbxtmp;
              snadi[ncount+yoffset] += dbytmp;
@ -404,10 +369,9 @@ void ComputeSNADAtom::unpack_reverse_comm(int n, int *list, double *buf)

 double ComputeSNADAtom::memory_usage()
 {
-  double bytes = nmax*size_peratom_cols * sizeof(double);
-  bytes += 3*njmax*sizeof(double);
-  bytes += njmax*sizeof(int);
-  bytes += 3*nperdim*atom->ntypes;
-  bytes += snaptr[0]->memory_usage()*comm->nthreads;
+
+  double bytes = nmax*size_peratom_cols * sizeof(double); // snad
+  bytes += snaptr->memory_usage();                        // SNA object
+
  return bytes;
 }
--- a/src/SNAP/compute_snad_atom.h
+++ b/src/SNAP/compute_snad_atom.h
@ -36,7 +36,7 @@ class ComputeSNADAtom : public Compute {
  double memory_usage();

 private:
-  int nmax, njmax, diagonalstyle;
+  int nmax;
  int ncoeff, nperdim, yoffset, zoffset;
  double **cutsq;
  class NeighList *list;
@ -44,10 +44,9 @@ class ComputeSNADAtom : public Compute {
  double rcutfac;
  double *radelem;
  double *wjelem;
-  class SNA** snaptr;
+  class SNA* snaptr;
  double cutmax;
  int quadraticflag;
-  int nthreads;
 };

 }
--- a/src/SNAP/compute_snav_atom.cpp
+++ b/src/SNAP/compute_snav_atom.cpp
@ -25,7 +25,6 @@
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
-#include "openmp_snap.h"

 using namespace LAMMPS_NS;

@ -45,7 +44,6 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :

  // default values

-  diagonalstyle = 0;
  rmin0 = 0.0;
  switchflag = 1;
  bzeroflag = 1;
@ -79,14 +77,7 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
  int iarg = nargmin;

  while (iarg < narg) {
-    if (strcmp(arg[iarg],"diagonal") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      diagonalstyle = atof(arg[iarg+1]);
-      if (diagonalstyle < 0 || diagonalstyle > 3)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      iarg += 2;
-    } else if (strcmp(arg[iarg],"rmin0") == 0) {
+    if (strcmp(arg[iarg],"rmin0") == 0) {
      if (iarg+2 > narg)
        error->all(FLERR,"Illegal compute snav/atom command");
      rmin0 = atof(arg[iarg+1]);
@ -109,20 +100,10 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
    } else error->all(FLERR,"Illegal compute snav/atom command");
  }

-  nthreads = comm->nthreads;
-  snaptr = new SNA*[nthreads];
-#if defined(_OPENMP)
-#pragma omp parallel default(none) shared(lmp,rfac0,twojmax,rmin0,switchflag,bzeroflag)
-#endif
-  {
-    int tid = omp_get_thread_num();
+  snaptr = new SNA(lmp,rfac0,twojmax,
+                   rmin0,switchflag,bzeroflag);

-    // always unset use_shared_arrays since it does not work with computes
-    snaptr[tid] = new SNA(lmp,rfac0,twojmax,diagonalstyle,
-                          0 /*use_shared_arrays*/, rmin0,switchflag,bzeroflag);
-  }
-
-  ncoeff = snaptr[0]->ncoeff;
+  ncoeff = snaptr->ncoeff;
  nperdim = ncoeff;
  if (quadraticflag) nperdim += (ncoeff*(ncoeff+1))/2;
  size_peratom_cols = 6*nperdim*atom->ntypes;
@ -130,7 +111,6 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
  peratom_flag = 1;

  nmax = 0;
-  njmax = 0;
  snav = NULL;

 }
@ -144,9 +124,7 @@ ComputeSNAVAtom::~ComputeSNAVAtom()
  memory->destroy(wjelem);
  memory->destroy(cutsq);

-  for (int tid = 0; tid<nthreads; tid++)
-    delete snaptr[tid];
-  delete [] snaptr;
+  delete snaptr;
 }

 /* ---------------------------------------------------------------------- */
@ -174,13 +152,7 @@ void ComputeSNAVAtom::init()
    if (strcmp(modify->compute[i]->style,"snav/atom") == 0) count++;
  if (count > 1 && comm->me == 0)
    error->warning(FLERR,"More than one compute snav/atom");
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-    int tid = omp_get_thread_num();
-    snaptr[tid]->init();
-  }
+  snaptr->init();
 }

 /* ---------------------------------------------------------------------- */
@ -230,11 +202,7 @@ void ComputeSNAVAtom::compute_peratom()
  double** const x = atom->x;
  const int* const mask = atom->mask;

-#if defined(_OPENMP)
-#pragma omp parallel for default(none)
-#endif
  for (int ii = 0; ii < inum; ii++) {
-    const int tid = omp_get_thread_num();
    const int i = ilist[ii];
    if (mask[i] & groupbit) {

@ -251,7 +219,7 @@ void ComputeSNAVAtom::compute_peratom()

      // insure rij, inside, and typej  are of size jnum

-          snaptr[tid]->grow_rij(jnum);
+      snaptr->grow_rij(jnum);

      // rij[][3] = displacements between atom I and those neighbors
      // inside = indices of neighbors of I within cutoff
@ -269,31 +237,29 @@ void ComputeSNAVAtom::compute_peratom()
        const double rsq = delx*delx + dely*dely + delz*delz;
        int jtype = type[j];
        if (rsq < cutsq[itype][jtype]&&rsq>1e-20) {
-          snaptr[tid]->rij[ninside][0] = delx;
-          snaptr[tid]->rij[ninside][1] = dely;
-          snaptr[tid]->rij[ninside][2] = delz;
-          snaptr[tid]->inside[ninside] = j;
-          snaptr[tid]->wj[ninside] = wjelem[jtype];
-          snaptr[tid]->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
+          snaptr->rij[ninside][0] = delx;
+          snaptr->rij[ninside][1] = dely;
+          snaptr->rij[ninside][2] = delz;
+          snaptr->inside[ninside] = j;
+          snaptr->wj[ninside] = wjelem[jtype];
+          snaptr->rcutij[ninside] = (radi+radelem[jtype])*rcutfac;
          ninside++;
        }
      }

-      snaptr[tid]->compute_ui(ninside);
-      snaptr[tid]->compute_zi();
+      snaptr->compute_ui(ninside);
+      snaptr->compute_zi();
      if (quadraticflag) {
-        snaptr[tid]->compute_bi();
-        snaptr[tid]->copy_bi2bvec();
+        snaptr->compute_bi();
      }

      for (int jj = 0; jj < ninside; jj++) {
-        const int j = snaptr[tid]->inside[jj];
+        const int j = snaptr->inside[jj];

-        snaptr[tid]->compute_duidrj(snaptr[tid]->rij[jj],
-                                    snaptr[tid]->wj[jj],
-                                    snaptr[tid]->rcutij[jj]);
-        snaptr[tid]->compute_dbidrj();
-        snaptr[tid]->copy_dbi2dbvec();
+        snaptr->compute_duidrj(snaptr->rij[jj],
+                                    snaptr->wj[jj],
+                                    snaptr->rcutij[jj]);
+        snaptr->compute_dbidrj();

        // Accumulate -dBi/dRi*Ri, -dBi/dRj*Rj

@ -301,18 +267,18 @@ void ComputeSNAVAtom::compute_peratom()
        double *snavj = snav[j]+typeoffset;

        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-          snavi[icoeff]           += snaptr[tid]->dbvec[icoeff][0]*xtmp;
-          snavi[icoeff+nperdim]   += snaptr[tid]->dbvec[icoeff][1]*ytmp;
-          snavi[icoeff+2*nperdim] += snaptr[tid]->dbvec[icoeff][2]*ztmp;
-          snavi[icoeff+3*nperdim] += snaptr[tid]->dbvec[icoeff][1]*ztmp;
-          snavi[icoeff+4*nperdim] += snaptr[tid]->dbvec[icoeff][0]*ztmp;
-          snavi[icoeff+5*nperdim] += snaptr[tid]->dbvec[icoeff][0]*ytmp;
-          snavj[icoeff]           -= snaptr[tid]->dbvec[icoeff][0]*x[j][0];
-          snavj[icoeff+nperdim]   -= snaptr[tid]->dbvec[icoeff][1]*x[j][1];
-          snavj[icoeff+2*nperdim] -= snaptr[tid]->dbvec[icoeff][2]*x[j][2];
-          snavj[icoeff+3*nperdim] -= snaptr[tid]->dbvec[icoeff][1]*x[j][2];
-          snavj[icoeff+4*nperdim] -= snaptr[tid]->dbvec[icoeff][0]*x[j][2];
-          snavj[icoeff+5*nperdim] -= snaptr[tid]->dbvec[icoeff][0]*x[j][1];
+          snavi[icoeff]           += snaptr->dblist[icoeff][0]*xtmp;
+          snavi[icoeff+nperdim]   += snaptr->dblist[icoeff][1]*ytmp;
+          snavi[icoeff+2*nperdim] += snaptr->dblist[icoeff][2]*ztmp;
+          snavi[icoeff+3*nperdim] += snaptr->dblist[icoeff][1]*ztmp;
+          snavi[icoeff+4*nperdim] += snaptr->dblist[icoeff][0]*ztmp;
+          snavi[icoeff+5*nperdim] += snaptr->dblist[icoeff][0]*ytmp;
+          snavj[icoeff]           -= snaptr->dblist[icoeff][0]*x[j][0];
+          snavj[icoeff+nperdim]   -= snaptr->dblist[icoeff][1]*x[j][1];
+          snavj[icoeff+2*nperdim] -= snaptr->dblist[icoeff][2]*x[j][2];
+          snavj[icoeff+3*nperdim] -= snaptr->dblist[icoeff][1]*x[j][2];
+          snavj[icoeff+4*nperdim] -= snaptr->dblist[icoeff][0]*x[j][2];
+          snavj[icoeff+5*nperdim] -= snaptr->dblist[icoeff][0]*x[j][1];
        }

        if (quadraticflag) {
@ -321,10 +287,10 @@ void ComputeSNAVAtom::compute_peratom()
          snavj += quadraticoffset;
          int ncount = 0;
          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
-            double bi = snaptr[tid]->bvec[icoeff];
-            double bix = snaptr[tid]->dbvec[icoeff][0];
-            double biy = snaptr[tid]->dbvec[icoeff][1];
-            double biz = snaptr[tid]->dbvec[icoeff][2];
+            double bi = snaptr->blist[icoeff];
+            double bix = snaptr->dblist[icoeff][0];
+            double biy = snaptr->dblist[icoeff][1];
+            double biz = snaptr->dblist[icoeff][2];

            // diagonal element of quadratic matrix

@ -348,12 +314,12 @@ void ComputeSNAVAtom::compute_peratom()
            // upper-triangular elements of quadratic matrix

            for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) {
-              double dbxtmp = bi*snaptr[tid]->dbvec[jcoeff][0]
-                + bix*snaptr[tid]->bvec[jcoeff];
-              double dbytmp = bi*snaptr[tid]->dbvec[jcoeff][1]
-                + biy*snaptr[tid]->bvec[jcoeff];
-              double dbztmp = bi*snaptr[tid]->dbvec[jcoeff][2]
-                + biz*snaptr[tid]->bvec[jcoeff];
+              double dbxtmp = bi*snaptr->dblist[jcoeff][0]
+                + bix*snaptr->blist[jcoeff];
+              double dbytmp = bi*snaptr->dblist[jcoeff][1]
+                + biy*snaptr->blist[jcoeff];
+              double dbztmp = bi*snaptr->dblist[jcoeff][2]
+                + biz*snaptr->blist[jcoeff];
              snavi[ncount] +=           dbxtmp*xtmp;
              snavi[ncount+nperdim] +=   dbytmp*ytmp;
              snavi[ncount+2*nperdim] += dbztmp*ztmp;
@ -414,11 +380,8 @@ void ComputeSNAVAtom::unpack_reverse_comm(int n, int *list, double *buf)

 double ComputeSNAVAtom::memory_usage()
 {
-  double bytes = nmax*size_peratom_cols * sizeof(double);
-  bytes += 3*njmax*sizeof(double);
-  bytes += njmax*sizeof(int);
-  bytes += 6*nperdim*atom->ntypes;
-  if (quadraticflag) bytes += 6*nperdim*atom->ntypes;
-  bytes += snaptr[0]->memory_usage()*comm->nthreads;
+  double bytes = nmax*size_peratom_cols * sizeof(double); // snav
+  bytes += snaptr->memory_usage();                        // SNA object
+
  return bytes;
 }
--- a/src/SNAP/compute_snav_atom.h
+++ b/src/SNAP/compute_snav_atom.h
@ -36,7 +36,7 @@ class ComputeSNAVAtom : public Compute {
  double memory_usage();

 private:
-  int nmax, njmax, diagonalstyle;
+  int nmax;
  int ncoeff, nperdim;
  double **cutsq;
  class NeighList *list;
@ -44,9 +44,8 @@ class ComputeSNAVAtom : public Compute {
  double rcutfac;
  double *radelem;
  double *wjelem;
-  class SNA** snaptr;
+  class SNA* snaptr;
  int quadraticflag;
-  int nthreads;
 };

 }
--- a/src/SNAP/openmp_snap.h
+++ b/src/SNAP/openmp_snap.h
@ -1,16 +0,0 @@
-
-#ifndef LMP_OPENMP_SNAP_H
-#define LMP_OPENMP_SNAP_H
-
-#if defined(_OPENMP)
-#include <omp.h>
-#else
-enum omp_sched_t {omp_sched_static, omp_sched_dynamic, omp_sched_guided, omp_sched_auto};
-inline int omp_get_thread_num() { return 0;}
-inline int omp_set_num_threads(int num_threads) {return 1;}
-/* inline int __sync_fetch_and_add(int* ptr, int value) {int tmp = *ptr; ptr[0]+=value; return tmp;} */
-inline void omp_set_schedule(omp_sched_t schedule,int modifier=1) {}
-inline int omp_in_parallel() {return 0;}
-#endif
-
-#endif
--- a/src/SNAP/pair_snap.cpp
+++ b/src/SNAP/pair_snap.cpp
--- a/src/SNAP/pair_snap.h
+++ b/src/SNAP/pair_snap.h
@ -29,8 +29,6 @@ public:
  PairSNAP(class LAMMPS *);
  ~PairSNAP();
  virtual void compute(int, int);
-  void compute_regular(int, int);
-  void compute_optimized(int, int);
  void settings(int, char **);
  virtual void coeff(int, char **);
  virtual void init_style();
@ -42,56 +40,14 @@ public:

 protected:
  int ncoeffq, ncoeffall;
-  double **bvec, ***dbvec;
-  class SNA** sna;
-  int nmax;
-  int nthreads;
+  class SNA* snaptr;
  virtual void allocate();
  void read_files(char *, char *);
  inline int equal(double* x,double* y);
  inline double dist2(double* x,double* y);
-  double extra_cutoff();
-  void load_balance();
-  void set_sna_to_shared(int snaid,int i);
-  void build_per_atom_arrays();

-  int schedule_user;
-  double schedule_time_guided;
-  double schedule_time_dynamic;
-
-  int ncalls_neigh;
-  int do_load_balance;
-  int ilistmask_max;
-  int* ilistmask;
-  int ghostinum;
-  int ghostilist_max;
-  int* ghostilist;
-  int ghostnumneigh_max;
-  int* ghostnumneigh;
-  int* ghostneighs;
-  int* ghostfirstneigh;
-  int ghostneighs_total;
-  int ghostneighs_max;
-
-  int use_optimized;
-  int use_shared_arrays;
-
-  int i_max;
-  int i_neighmax;
-  int i_numpairs;
-  int **i_pairs;
-  double ***i_rij;
-  int **i_inside;
-  double **i_wj;
-  double **i_rcutij;
-  int *i_ninside;
-  double ****i_uarraytot_r, ****i_uarraytot_i;
-  double ******i_zarray_r, ******i_zarray_i;
-
-#ifdef TIMING_INFO
-  //  timespec starttime, endtime;
-  double timers[4];
-#endif
+  void compute_beta();
+  void compute_bispectrum();

  double rcutmax;               // max cutoff for all elements
  int nelements;                // # of unique elements
@ -99,10 +55,13 @@ protected:
  double *radelem;              // element radii
  double *wjelem;               // elements weights
  double **coeffelem;           // element bispectrum coefficients
+  double** beta;                // betas for all atoms in list
+  double** bispectrum;          // bispectrum components for all atoms in list
  int *map;                     // mapping from atom types to elements
-  int twojmax, diagonalstyle, switchflag, bzeroflag;
+  int twojmax, switchflag, bzeroflag;
  double rfac0, rmin0, wj1, wj2;
  int rcutfacflag, twojmaxflag; // flags for required parameters
+  int beta_max;                 // length of beta
 };

 }
@ -124,15 +83,6 @@ Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.

-E: Must set number of threads via package omp command
-
-Because you are using the USER-OMP package, set the number of threads
-via its settings, not by the pair_style snap nthreads setting.
-
-W: Communication cutoff is too small for SNAP micro load balancing, increased to %lf
-
-Self-explanatory.
-
 E: Incorrect args for pair coefficients

 Self-explanatory.  Check the input script or data file.
--- a/src/SNAP/sna.cpp
+++ b/src/SNAP/sna.cpp
--- a/src/SNAP/sna.h
+++ b/src/SNAP/sna.h
@ -18,20 +18,22 @@
 #ifndef LMP_SNA_H
 #define LMP_SNA_H

-#include <complex>
 #include "pointers.h"
-#include <ctime>

 namespace LAMMPS_NS {

-struct SNA_LOOPINDICES {
+struct SNA_ZINDICES {
+  int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju;
+};
+
+struct SNA_BINDICES {
  int j1, j2, j;
 };

 class SNA : protected Pointers {

 public:
-  SNA(LAMMPS*, double, int, int, int, double, int, int);
+  SNA(LAMMPS*, double, int, double, int, int);

  SNA(LAMMPS* lmp) : Pointers(lmp) {};
  ~SNA();
@ -44,31 +46,21 @@ public:
  // functions for bispectrum coefficients

  void compute_ui(int);
-  void compute_ui_omp(int, int);
  void compute_zi();
-  void compute_zi_omp(int);
+  void compute_yi(const double*);
+  void compute_yterm(int, int, int, const double*);
  void compute_bi();
-  void copy_bi2bvec();

  // functions for derivatives

  void compute_duidrj(double*, double, double);
  void compute_dbidrj();
-  void compute_dbidrj_nonsymm();
-  void copy_dbi2dbvec();
+  void compute_deidrj(double*);
  double compute_sfac(double, double);
  double compute_dsfac(double, double);

-#ifdef TIMING_INFO
-  double* timers;
-  timespec starttime, endtime;
-  int print;
-  int counter;
-#endif
-
-  //per sna class instance for OMP use
-
-  double* bvec, ** dbvec;
+  double* blist;
+  double** dblist;
  double** rij;
  int* inside;
  double* wj;
@ -77,29 +69,32 @@ public:

  void grow_rij(int);

-  int twojmax, diagonalstyle;
-  double*** uarraytot_r, *** uarraytot_i;
-  double***** zarray_r, ***** zarray_i;
-  double*** uarraytot_r_b, *** uarraytot_i_b;
-  double***** zarray_r_b, ***** zarray_i_b;
-  double*** uarray_r, *** uarray_i;
+  int twojmax;

 private:
  double rmin0, rfac0;

-  //use indexlist instead of loops, constructor generates these
-  SNA_LOOPINDICES* idxj;
-  int idxj_max;
  // data for bispectrum coefficients

-  double***** cgarray;
+  SNA_ZINDICES* idxz;
+  SNA_BINDICES* idxb;
+  int idxcg_max, idxu_max, idxz_max, idxb_max;
+
  double** rootpqarray;
-  double*** barray;
+  double* cglist;  
+  int*** idxcg_block; 

-  // derivatives of data
+  double* ulisttot_r, * ulisttot_i;
+  double* ulist_r, * ulist_i;
+  int* idxu_block;

-  double**** duarray_r, **** duarray_i;
-  double**** dbarray;
+  double* zlist_r, * zlist_i;
+  int*** idxz_block;
+
+  int*** idxb_block;
+
+  double** dulist_r, ** dulist_i;
+  double* ylist_r, * ylist_i;

  static const int nmaxfactorial = 167;
  static const double nfac_table[];
@ -109,28 +104,16 @@ private:
  void destroy_twojmax_arrays();
  void init_clebsch_gordan();
  void init_rootpqarray();
-  void jtostr(char*, int);
-  void mtostr(char*, int, int);
-  void print_clebsch_gordan(FILE*);
  void zero_uarraytot();
  void addself_uarraytot(double);
  void add_uarraytot(double, double, double);
-  void add_uarraytot_omp(double, double, double);
  void compute_uarray(double, double, double,
                      double, double);
-  void compute_uarray_omp(double, double, double,
-                          double, double, int);
  double deltacg(int, int, int);
  int compute_ncoeff();
  void compute_duarray(double, double, double,
                       double, double, double, double, double);

-  // if number of atoms are small use per atom arrays
-  // for twojmax arrays, rij, inside, bvec
-  // this will increase the memory footprint considerably,
-  // but allows parallel filling and reuse of these arrays
-  int use_shared_arrays;
-
  // Sets the style for the switching function
  // 0 = none
  // 1 = cosine
@ -140,7 +123,7 @@ private:
  double wself;

  int bzero_flag; // 1 if bzero subtracted from barray
-  double *bzero;  // array of B values for isolated atoms
+  double* bzero;  // array of B values for isolated atoms
 };

 }
--- a/src/SPIN/atom_vec_spin.cpp
+++ b/src/SPIN/atom_vec_spin.cpp
@ -48,7 +48,7 @@ AtomVecSpin::AtomVecSpin(LAMMPS *lmp) : AtomVec(lmp)
  comm_x_only = 0;
  comm_f_only = 0;
  size_forward = 7;
-  size_reverse = 6;
+  size_reverse = 9;
  size_border = 10;
  size_velocity = 3;
  size_data_atom = 9;
@ -58,7 +58,6 @@ AtomVecSpin::AtomVecSpin(LAMMPS *lmp) : AtomVec(lmp)
  atom->sp_flag = 1;
 }

-
 /* ----------------------------------------------------------------------
   grow atom arrays
   n = 0 grows arrays by a chunk
@ -88,6 +87,7 @@ void AtomVecSpin::grow(int n)

  sp = memory->grow(atom->sp,nmax,4,"atom:sp");
  fm = memory->grow(atom->fm,nmax*comm->nthreads,3,"atom:fm");
+  fm_long = memory->grow(atom->fm_long,nmax*comm->nthreads,3,"atom:fm_long");

  if (atom->nextra_grow)
    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@ -103,10 +103,9 @@ void AtomVecSpin::grow_reset()
  tag = atom->tag; type = atom->type;
  mask = atom->mask; image = atom->image;
  x = atom->x; v = atom->v; f = atom->f;
-  sp = atom->sp; fm = atom->fm;
+  sp = atom->sp; fm = atom->fm; fm_long = atom->fm_long;
 }

-
 /* ----------------------------------------------------------------------
   copy atom I info to atom J
 ------------------------------------------------------------------------- */
@ -342,6 +341,9 @@ int AtomVecSpin::pack_reverse(int n, int first, double *buf)
    buf[m++] = fm[i][0];
    buf[m++] = fm[i][1];
    buf[m++] = fm[i][2];
+    buf[m++] = fm_long[i][0];
+    buf[m++] = fm_long[i][1];
+    buf[m++] = fm_long[i][2];
  }

  return m;
@ -361,6 +363,9 @@ void AtomVecSpin::unpack_reverse(int n, int *list, double *buf)
    fm[j][0] += buf[m++];
    fm[j][1] += buf[m++];
    fm[j][2] += buf[m++];
+    fm_long[j][0] += buf[m++];
+    fm_long[j][1] += buf[m++];
+    fm_long[j][2] += buf[m++];
  }
 }

@ -939,6 +944,7 @@ bigint AtomVecSpin::memory_usage()

  if (atom->memcheck("sp")) bytes += memory->usage(sp,nmax,4);
  if (atom->memcheck("fm")) bytes += memory->usage(fm,nmax*comm->nthreads,3);
+  if (atom->memcheck("fm_long")) bytes += memory->usage(fm_long,nmax*comm->nthreads,3);

  return bytes;
 }
@ -951,6 +957,7 @@ void AtomVecSpin::force_clear(int /*n*/, size_t nbytes)
 {
  memset(&atom->f[0][0],0,3*nbytes);
  memset(&atom->fm[0][0],0,3*nbytes);
+  memset(&atom->fm_long[0][0],0,3*nbytes);
 }


--- a/src/SPIN/atom_vec_spin.h
+++ b/src/SPIN/atom_vec_spin.h
@ -68,10 +68,12 @@ class AtomVecSpin : public AtomVec {
  int *type,*mask;
  imageint *image;
  double **x,**v,**f;		// lattice quantities
-  double **sp,**fm;		// spin quantities
-  				// sp[i][0-2] direction of the spin i
+  
+  				// spin quantities
+  double **sp;			// sp[i][0-2] direction of the spin i
 				// sp[i][3] atomic magnetic moment of the spin i
-
+  double **fm;			// fm[i][0-2] direction of magnetic precession
+  double **fm_long;		// storage of long-range spin prec. components
 };

 }
--- a/src/SPIN/fix_nve_spin.cpp
+++ b/src/SPIN/fix_nve_spin.cpp
@ -129,6 +129,7 @@ FixNVESpin::FixNVESpin(LAMMPS *lmp, int narg, char **arg) :
  // initialize the magnetic interaction flags

  pair_spin_flag = 0;
+  long_spin_flag = 0;
  precession_spin_flag = 0;
  maglangevin_flag = 0;
  tdamp_flag = temp_flag = 0;
@ -213,8 +214,16 @@ void FixNVESpin::init()
  if (count != npairspin)
    error->all(FLERR,"Incorrect number of spin pairs");

+  // set pair/spin and long/spin flags
+
  if (npairspin >= 1) pair_spin_flag = 1;

+  for (int i = 0; i<npairs; i++) {
+    if (force->pair_match("spin/long",0,i)) {
+      long_spin_flag = 1;
+    }
+  }
+
  // ptrs FixPrecessionSpin classes

  int iforce;
--- a/src/SPIN/fix_nve_spin.h
+++ b/src/SPIN/fix_nve_spin.h
@ -48,7 +48,6 @@ friend class PairSpin;
  int lattice_flag; 			// lattice_flag = 0 if spins only
  					// lattice_flag = 1 if spin-lattice calc.

-
 protected:
  int sector_flag;			// sector_flag = 0  if serial algorithm
  					// sector_flag = 1  if parallel algorithm
@ -58,6 +57,7 @@ friend class PairSpin;
  int nlocal_max;			// max value of nlocal (for lists size)

  int pair_spin_flag;			// magnetic pair flags
+  int long_spin_flag;			// magnetic long-range flag
  int precession_spin_flag;		// magnetic precession flags
  int maglangevin_flag;			// magnetic langevin flags
  int tdamp_flag, temp_flag;
--- a/src/SPIN/pair_spin_dipole_cut.cpp
+++ b/src/SPIN/pair_spin_dipole_cut.cpp
@ -0,0 +1,537 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   www.cs.sandia.gov/~sjplimp/lammps.html
+   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ------------------------------------------------------------------------
+   Contributing authors: Julien Tranchida (SNL)
+                         Stan Moore (SNL)
+
+   Please cite the related publication:
+   Tranchida, J., Plimpton, S. J., Thibaudeau, P., & Thompson, A. P. (2018).
+   Massively parallel symplectic algorithm for coupled magnetic spin dynamics
+   and molecular dynamics. Journal of Computational Physics.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_spin_dipole_cut.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "fix_nve_spin.h"
+#include "force.h"
+#include "math_const.h"
+#include "memory.h"
+#include "modify.h"
+#include "error.h"
+#include "update.h"
+
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+/* ---------------------------------------------------------------------- */
+
+PairSpinDipoleCut::PairSpinDipoleCut(LAMMPS *lmp) : PairSpin(lmp),
+lockfixnvespin(NULL)
+{
+  single_enable = 0;
+  spinflag = 1;
+  respa_enable = 0;
+  no_virial_fdotr_compute = 1;
+  lattice_flag = 0;
+
+  hbar = force->hplanck/MY_2PI;			// eV/(rad.THz)
+  mub = 9.274e-4;                               // in A.Ang^2
+  mu_0 = 785.15;                                // in eV/Ang/A^2
+  mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);     // in eV.Ang^3
+  //mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);	// in eV
+  mub2mu0hbinv = mub2mu0 / hbar;		// in rad.THz
+
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairSpinDipoleCut::~PairSpinDipoleCut()
+{
+  if (allocated) {
+    memory->destroy(setflag);
+    memory->destroy(cut_spin_long);
+    memory->destroy(cutsq);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::settings(int narg, char **arg)
+{
+  if (narg < 1 || narg > 2)
+    error->all(FLERR,"Incorrect args in pair_style command");
+
+  if (strcmp(update->unit_style,"metal") != 0)
+    error->all(FLERR,"Spin simulations require metal unit style");
+
+  if (!atom->sp) 
+    error->all(FLERR,"Pair/spin style requires atom attribute sp");
+
+  cut_spin_long_global = force->numeric(FLERR,arg[0]);
+  
+  // reset cutoffs that have been explicitly set
+  
+  if (allocated) {
+    int i,j;
+    for (i = 1; i <= atom->ntypes; i++) {
+      for (j = i+1; j <= atom->ntypes; j++) {
+        if (setflag[i][j]) {
+          cut_spin_long[i][j] = cut_spin_long_global;
+        }
+      }
+    }
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+ 
+  if (narg != 3) 
+    error->all(FLERR,"Incorrect args in pair_style command");
+  
+  int ilo,ihi,jlo,jhi;
+  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+
+  double spin_long_cut_one = force->numeric(FLERR,arg[2]);
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      setflag[i][j] = 1;
+      cut_spin_long[i][j] = spin_long_cut_one;
+      count++;
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::init_style()
+{
+  if (!atom->sp_flag)
+    error->all(FLERR,"Pair spin requires atom/spin style");
+  
+  // need a full neighbor list
+
+  int irequest = neighbor->request(this,instance_me);
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+  
+  // checking if nve/spin or neb/spin are a listed fix
+
+  int ifix = 0;
+  while (ifix < modify->nfix) {
+    if (strcmp(modify->fix[ifix]->style,"nve/spin") == 0) break;
+    if (strcmp(modify->fix[ifix]->style,"neb/spin") == 0) break;
+    ifix++;
+  }
+  if ((ifix == modify->nfix) && (comm->me == 0))
+    error->warning(FLERR,"Using pair/spin style without nve/spin or neb/spin");
+
+  // get the lattice_flag from nve/spin
+
+  for (int i = 0; i < modify->nfix; i++) {
+    if (strcmp(modify->fix[i]->style,"nve/spin") == 0) {
+      lockfixnvespin = (FixNVESpin *) modify->fix[i];
+      lattice_flag = lockfixnvespin->lattice_flag;
+    }
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+double PairSpinDipoleCut::init_one(int i, int j)
+{
+  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
+  
+  cut_spin_long[j][i] = cut_spin_long[i][j];
+  
+  return cut_spin_long_global;
+}
+
+/* ----------------------------------------------------------------------
+   extract the larger cutoff if "cut" or "cut_coul"
+------------------------------------------------------------------------- */
+
+void *PairSpinDipoleCut::extract(const char *str, int &dim)
+{
+  if (strcmp(str,"cut") == 0) {
+    dim = 0;
+    return (void *) &cut_spin_long_global;
+  } else if (strcmp(str,"cut_coul") == 0) {
+    dim = 0;
+    return (void *) &cut_spin_long_global;
+  } else if (strcmp(str,"ewald_order") == 0) {
+    ewald_order = 0;
+    ewald_order |= 1<<1;
+    ewald_order |= 1<<3;
+    dim = 0;
+    return (void *) &ewald_order;
+  } else if (strcmp(str,"ewald_mix") == 0) {
+    dim = 0;
+    return (void *) &mix_flag;
+  }
+  return NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::compute(int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;  
+  int *ilist,*jlist,*numneigh,**firstneigh;  
+  double rinv,r2inv,r3inv,rsq,local_cut2,evdwl,ecoul;
+  double xi[3],rij[3],eij[3],spi[4],spj[4],fi[3],fmi[3];
+
+  evdwl = ecoul = 0.0;
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  int *type = atom->type;  
+  int nlocal = atom->nlocal;  
+  int newton_pair = force->newton_pair;
+  double **x = atom->x;
+  double **f = atom->f;
+  double **fm = atom->fm;
+  double **sp = atom->sp;	
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // computation of the exchange interaction
+  // loop over atoms and their neighbors
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    xi[0] = x[i][0];
+    xi[1] = x[i][1];
+    xi[2] = x[i][2];
+    jlist = firstneigh[i];
+    jnum = numneigh[i]; 
+    spi[0] = sp[i][0]; 
+    spi[1] = sp[i][1]; 
+    spi[2] = sp[i][2];
+    spi[3] = sp[i][3];
+    itype = type[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+
+      spj[0] = sp[j][0]; 
+      spj[1] = sp[j][1]; 
+      spj[2] = sp[j][2]; 
+      spj[3] = sp[j][3]; 
+
+      evdwl = 0.0;
+      fi[0] = fi[1] = fi[2] = 0.0;
+      fmi[0] = fmi[1] = fmi[2] = 0.0;
+     
+      rij[0] = x[j][0] - xi[0];
+      rij[1] = x[j][1] - xi[1];
+      rij[2] = x[j][2] - xi[2];
+      rsq = rij[0]*rij[0] + rij[1]*rij[1] + rij[2]*rij[2];
+      rinv = 1.0/sqrt(rsq);
+      eij[0] = rij[0]*rinv;
+      eij[1] = rij[1]*rinv;
+      eij[2] = rij[2]*rinv;
+
+      local_cut2 = cut_spin_long[itype][jtype]*cut_spin_long[itype][jtype];
+
+      if (rsq < local_cut2) {
+        r2inv = 1.0/rsq;
+	r3inv = r2inv*rinv;
+	
+	compute_dipolar(i,j,eij,fmi,spi,spj,r3inv);
+	if (lattice_flag) compute_dipolar_mech(i,j,eij,fi,spi,spj,r2inv);
+      }
+
+      // force accumulation
+
+      f[i][0] += fi[0]; 
+      f[i][1] += fi[1];  	  
+      f[i][2] += fi[2];
+      fm[i][0] += fmi[0];
+      fm[i][1] += fmi[1];
+      fm[i][2] += fmi[2];
+
+      if (newton_pair || j < nlocal) {
+	f[j][0] -= fi[0];	 
+        f[j][1] -= fi[1];	  	  
+        f[j][2] -= fi[2];
+      }
+
+      if (eflag) {
+	if (rsq <= local_cut2) {
+	  evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
+	  evdwl *= hbar;
+	}
+      } else evdwl = 0.0;
+
+      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+	  evdwl,ecoul,fi[0],fi[1],fi[2],rij[0],rij[1],rij[2]);
+
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   update the pair interaction fmi acting on the spin ii
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::compute_single_pair(int ii, double fmi[3])
+{
+  int j,jnum,itype,jtype,ntypes; 
+  int *ilist,*jlist,*numneigh,**firstneigh;  
+  double rsq,rinv,r2inv,r3inv,local_cut2;
+  double xi[3],rij[3],eij[3],spi[4],spj[4];
+
+  int k,locflag;
+  int *type = atom->type;  
+  double **x = atom->x;
+  double **sp = atom->sp;	
+
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // check if interaction applies to type of ii
+  
+  itype = type[ii];
+  ntypes = atom->ntypes;
+  locflag = 0;
+  k = 1;
+  while (k <= ntypes) {
+    if (k <= itype) {
+      if (setflag[k][itype] == 1) {
+        locflag =1;
+        break;
+      }
+      k++;
+    } else if (k > itype) {
+      if (setflag[itype][k] == 1) {
+        locflag =1;
+        break;
+      }
+      k++;
+    } else error->all(FLERR,"Wrong type number");
+  }
+
+
+  // if interaction applies to type ii,
+  // locflag = 1 and compute pair interaction
+ 
+  if (locflag == 1) {
+
+    xi[0] = x[ii][0];
+    xi[1] = x[ii][1];
+    xi[2] = x[ii][2];
+    spi[0] = sp[ii][0]; 
+    spi[1] = sp[ii][1]; 
+    spi[2] = sp[ii][2];
+    spi[3] = sp[ii][3];
+    jlist = firstneigh[ii];
+    jnum = numneigh[ii]; 
+    
+    for (int jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+
+      spj[0] = sp[j][0]; 
+      spj[1] = sp[j][1]; 
+      spj[2] = sp[j][2]; 
+      spj[3] = sp[j][3]; 
+
+      rij[0] = x[j][0] - xi[0];
+      rij[1] = x[j][1] - xi[1];
+      rij[2] = x[j][2] - xi[2];
+      rsq = rij[0]*rij[0] + rij[1]*rij[1] + rij[2]*rij[2];
+      rinv = 1.0/sqrt(rsq);
+      eij[0] = rij[0]*rinv;
+      eij[1] = rij[1]*rinv;
+      eij[2] = rij[2]*rinv;
+
+      local_cut2 = cut_spin_long[itype][jtype]*cut_spin_long[itype][jtype];
+
+      if (rsq < local_cut2) {
+        r2inv = 1.0/rsq;
+        r3inv = r2inv*rinv;
+        
+        // compute dipolar interaction
+        
+        compute_dipolar(ii,j,eij,fmi,spi,spj,r3inv);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute dipolar interaction between spins i and j
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::compute_dipolar(int i, int j, double eij[3], 
+    double fmi[3], double spi[4], double spj[4], double r3inv)
+{
+  double sjdotr;
+  double gigjiri3,pre;
+
+  sjdotr = spj[0]*eij[0] + spj[1]*eij[1] + spj[2]*eij[2];
+  gigjiri3 = (spi[3] * spj[3])*r3inv;
+  pre = mub2mu0hbinv * gigjiri3;
+
+  fmi[0] += pre * (3.0 * sjdotr *eij[0] - spj[0]);
+  fmi[1] += pre * (3.0 * sjdotr *eij[1] - spj[1]);
+  fmi[2] += pre * (3.0 * sjdotr *eij[2] - spj[2]);
+}
+
+/* ----------------------------------------------------------------------
+   compute the mechanical force due to the dipolar interaction between 
+   atom i and atom j
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::compute_dipolar_mech(int i, int j, double eij[3],
+    double fi[3], double spi[3], double spj[3], double r2inv)
+{
+  double sisj,sieij,sjeij;
+  double gigjri4,bij,pre;
+
+  gigjri4 = (spi[3] * spj[3])*r2inv*r2inv;
+  sisj = spi[0]*spj[0] + spi[1]*spj[1] + spi[2]*spj[2];
+  sieij = spi[0]*eij[0] + spi[1]*eij[1] + spi[2]*eij[2];
+  sjeij = spj[0]*eij[0] + spj[1]*eij[1] + spj[2]*eij[2];
+  
+  bij = sisj - 5.0*sieij*sjeij;
+  pre = 3.0*mub2mu0*gigjri4;
+
+  fi[0] -= pre * (eij[0] * bij + (sjeij*spi[0] + sieij*spj[0]));
+  fi[1] -= pre * (eij[1] * bij + (sjeij*spi[1] + sieij*spj[1]));
+  fi[2] -= pre * (eij[2] * bij + (sjeij*spi[2] + sieij*spj[2]));
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::allocate()
+{
+  allocated = 1;
+  int n = atom->ntypes;
+
+  memory->create(setflag,n+1,n+1,"pair:setflag");
+  for (int i = 1; i <= n; i++)
+    for (int j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  memory->create(cut_spin_long,n+1,n+1,"pair/spin/long:cut_spin_long");
+  memory->create(cutsq,n+1,n+1,"pair/spin/long:cutsq");
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::write_restart(FILE *fp)
+{
+  write_restart_settings(fp);
+
+  int i,j;
+  for (i = 1; i <= atom->ntypes; i++) {
+    for (j = i; j <= atom->ntypes; j++) {
+      fwrite(&setflag[i][j],sizeof(int),1,fp);
+      if (setflag[i][j]) {
+	fwrite(&cut_spin_long[i][j],sizeof(int),1,fp);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::read_restart(FILE *fp)
+{
+  read_restart_settings(fp);
+
+  allocate();
+
+  int i,j;
+  int me = comm->me;
+  for (i = 1; i <= atom->ntypes; i++) {
+    for (j = i; j <= atom->ntypes; j++) {
+      if (me == 0) fread(&setflag[i][j],sizeof(int),1,fp);
+      MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
+      if (setflag[i][j]) {
+	if (me == 0) {
+	  fread(&cut_spin_long[i][j],sizeof(int),1,fp);
+	}
+	MPI_Bcast(&cut_spin_long[i][j],1,MPI_INT,0,world);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::write_restart_settings(FILE *fp)
+{
+  fwrite(&cut_spin_long_global,sizeof(double),1,fp);
+  fwrite(&mix_flag,sizeof(int),1,fp);
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::read_restart_settings(FILE *fp)
+{
+  if (comm->me == 0) {
+    fread(&cut_spin_long_global,sizeof(double),1,fp);
+    fread(&mix_flag,sizeof(int),1,fp);
+  }
+  MPI_Bcast(&cut_spin_long_global,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
+}
--- a/src/SPIN/pair_spin_dipole_cut.h
+++ b/src/SPIN/pair_spin_dipole_cut.h
@ -0,0 +1,100 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   www.cs.sandia.gov/~sjplimp/lammps.html
+   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(spin/dipole/cut,PairSpinDipoleCut)
+
+#else
+
+#ifndef LMP_PAIR_SPIN_DIPOLE_CUT_H
+#define LMP_PAIR_SPIN_DIPOLE_CUT_H
+
+#include "pair_spin.h"
+
+namespace LAMMPS_NS {
+
+class PairSpinDipoleCut : public PairSpin {
+ public:
+  double cut_coul;
+  double **sigma;
+
+  PairSpinDipoleCut(class LAMMPS *);
+  ~PairSpinDipoleCut();
+  void settings(int, char **);
+  void coeff(int, char **);
+  double init_one(int, int);
+  void init_style();
+  void *extract(const char *, int &); 
+  
+  void compute(int, int);
+  void compute_single_pair(int, double *);
+
+  void compute_dipolar(int, int, double *, double *, double *, 
+      double *, double);
+  void compute_dipolar_mech(int, int, double *, double *, double *, 
+      double *, double);
+
+  void write_restart(FILE *);
+  void read_restart(FILE *);
+  void write_restart_settings(FILE *);
+  void read_restart_settings(FILE *);
+  
+  double cut_spin_long_global;	// global long cutoff distance 
+
+ protected:
+  double hbar;	 		// reduced Planck's constant
+  double mub;			// Bohr's magneton
+  double mu_0;			// vacuum permeability
+  double mub2mu0;		// prefactor for mech force
+  double mub2mu0hbinv;		// prefactor for mag force
+
+  double **cut_spin_long;	// cutoff distance long
+
+  double g_ewald;
+  int ewald_order;
+
+  int lattice_flag;			// flag for mech force computation
+  class FixNVESpin *lockfixnvespin;	// ptr for setups
+
+  void allocate();
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Incorrect args in pair_style command
+
+Self-explanatory.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair dipole/long requires atom attributes q, mu, torque
+
+The atom style defined does not have these attributes.
+
+E: Cannot (yet) use 'electron' units with dipoles
+
+This feature is not yet supported.
+
+E: Pair style requires a KSpace style
+
+No kspace style is defined.
+
+*/
--- a/src/SPIN/pair_spin_dipole_long.cpp
+++ b/src/SPIN/pair_spin_dipole_long.cpp
@ -0,0 +1,607 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   www.cs.sandia.gov/~sjplimp/lammps.html
+   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ------------------------------------------------------------------------
+   Contributing authors: Julien Tranchida (SNL)
+			 Stan Moore (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "pair_spin_dipole_long.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "fix_nve_spin.h"
+#include "force.h"
+#include "kspace.h"
+#include "math_const.h"
+#include "memory.h"
+#include "modify.h"
+#include "error.h"
+#include "update.h"
+
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairSpinDipoleLong::PairSpinDipoleLong(LAMMPS *lmp) : PairSpin(lmp),
+lockfixnvespin(NULL)
+{
+  single_enable = 0;
+  ewaldflag = pppmflag = spinflag = 1;
+  respa_enable = 0;
+  no_virial_fdotr_compute = 1;
+  lattice_flag = 0;
+
+  hbar = force->hplanck/MY_2PI;			// eV/(rad.THz)
+  mub = 9.274e-4;                               // in A.Ang^2
+  mu_0 = 785.15;                                // in eV/Ang/A^2
+  mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);     // in eV.Ang^3
+  //mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);	// in eV
+  mub2mu0hbinv = mub2mu0 / hbar;		// in rad.THz
+
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairSpinDipoleLong::~PairSpinDipoleLong()
+{
+  if (allocated) {
+    memory->destroy(setflag);
+    memory->destroy(cut_spin_long);
+    memory->destroy(cutsq);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::settings(int narg, char **arg)
+{
+  if (narg < 1 || narg > 2)
+    error->all(FLERR,"Incorrect args in pair_style command");
+
+  if (strcmp(update->unit_style,"metal") != 0)
+    error->all(FLERR,"Spin simulations require metal unit style");
+
+  cut_spin_long_global = force->numeric(FLERR,arg[0]);
+  
+  // reset cutoffs that have been explicitly set
+  
+  if (allocated) {
+    int i,j;
+    for (i = 1; i <= atom->ntypes; i++) {
+      for (j = i+1; j <= atom->ntypes; j++) {
+        if (setflag[i][j]) {
+          cut_spin_long[i][j] = cut_spin_long_global;
+        }
+      }
+    }
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+  
+  if (narg != 3)
+    error->all(FLERR,"Incorrect args in pair_style command");
+  
+  int ilo,ihi,jlo,jhi;
+  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+
+  double spin_long_cut_one = force->numeric(FLERR,arg[2]);
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      setflag[i][j] = 1;
+      cut_spin_long[i][j] = spin_long_cut_one;
+      count++;
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::init_style()
+{
+  if (!atom->sp_flag)
+    error->all(FLERR,"Pair spin requires atom/spin style");
+  
+  // need a full neighbor list
+
+  int irequest = neighbor->request(this,instance_me);
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+  
+  // checking if nve/spin or neb/spin are a listed fix
+
+  int ifix = 0;
+  while (ifix < modify->nfix) {
+    if (strcmp(modify->fix[ifix]->style,"nve/spin") == 0) break;
+    if (strcmp(modify->fix[ifix]->style,"neb/spin") == 0) break;
+    ifix++;
+  }
+  if ((ifix == modify->nfix) && (comm->me == 0))
+    error->warning(FLERR,"Using pair/spin style without nve/spin or neb/spin");
+
+  // get the lattice_flag from nve/spin
+
+  for (int i = 0; i < modify->nfix; i++) {
+    if (strcmp(modify->fix[i]->style,"nve/spin") == 0) {
+      lockfixnvespin = (FixNVESpin *) modify->fix[i];
+      lattice_flag = lockfixnvespin->lattice_flag;
+    }
+  }
+
+  // insure use of KSpace long-range solver, set g_ewald
+
+  if (force->kspace == NULL)
+    error->all(FLERR,"Pair style requires a KSpace style");
+
+  g_ewald = force->kspace->g_ewald;
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+double PairSpinDipoleLong::init_one(int i, int j)
+{
+  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
+  
+  cut_spin_long[j][i] = cut_spin_long[i][j];
+  
+  return cut_spin_long_global;
+}
+
+/* ----------------------------------------------------------------------
+   extract the larger cutoff if "cut" or "cut_coul"
+------------------------------------------------------------------------- */
+
+void *PairSpinDipoleLong::extract(const char *str, int &dim)
+{
+  if (strcmp(str,"cut") == 0) {
+    dim = 0;
+    return (void *) &cut_spin_long_global;
+  } else if (strcmp(str,"cut_coul") == 0) {
+    dim = 0;
+    return (void *) &cut_spin_long_global;
+  } else if (strcmp(str,"ewald_order") == 0) {
+    ewald_order = 0;
+    ewald_order |= 1<<1;
+    ewald_order |= 1<<3;
+    dim = 0;
+    return (void *) &ewald_order;
+  } else if (strcmp(str,"ewald_mix") == 0) {
+    dim = 0;
+    return (void *) &mix_flag;
+  }
+  return NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::compute(int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;  
+  double r,rinv,r2inv,rsq;
+  double grij,expm2,t,erfc;
+  double evdwl,ecoul;
+  double bij[4];
+  double xi[3],rij[3],eij[3];
+  double spi[4],spj[4];
+  double fi[3],fmi[3];
+  double local_cut2;
+  double pre1,pre2,pre3;
+  int *ilist,*jlist,*numneigh,**firstneigh;  
+
+  evdwl = ecoul = 0.0;
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double **fm = atom->fm;
+  double **sp = atom->sp;	
+  int *type = atom->type;  
+  int nlocal = atom->nlocal;  
+  int newton_pair = force->newton_pair;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  pre1 = 2.0 * g_ewald / MY_PIS;
+  pre2 = 4.0 * pow(g_ewald,3.0) / MY_PIS;
+  pre3 = 8.0 * pow(g_ewald,5.0) / MY_PIS;
+
+  // computation of the exchange interaction
+  // loop over atoms and their neighbors
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    xi[0] = x[i][0];
+    xi[1] = x[i][1];
+    xi[2] = x[i][2];
+    jlist = firstneigh[i];
+    jnum = numneigh[i]; 
+    spi[0] = sp[i][0]; 
+    spi[1] = sp[i][1]; 
+    spi[2] = sp[i][2];
+    spi[3] = sp[i][3];
+    itype = type[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+
+      spj[0] = sp[j][0]; 
+      spj[1] = sp[j][1]; 
+      spj[2] = sp[j][2]; 
+      spj[3] = sp[j][3]; 
+
+      evdwl = 0.0;
+
+      fi[0] = fi[1] = fi[2] = 0.0;
+      fmi[0] = fmi[1] = fmi[2] = 0.0;
+      bij[0] = bij[1] = bij[2] = bij[3] = 0.0;
+     
+      rij[0] = x[j][0] - xi[0];
+      rij[1] = x[j][1] - xi[1];
+      rij[2] = x[j][2] - xi[2];
+      rsq = rij[0]*rij[0] + rij[1]*rij[1] + rij[2]*rij[2];
+      rinv = 1.0/sqrt(rsq);
+      eij[0] = rij[0]*rinv;
+      eij[1] = rij[1]*rinv;
+      eij[2] = rij[2]*rinv;
+
+      local_cut2 = cut_spin_long[itype][jtype]*cut_spin_long[itype][jtype];
+
+      if (rsq < local_cut2) {
+        r2inv = 1.0/rsq;
+
+        r = sqrt(rsq);
+        grij = g_ewald * r;
+        expm2 = exp(-grij*grij);
+        t = 1.0 / (1.0 + EWALD_P*grij);
+        erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+
+        bij[0] = erfc * rinv;
+        bij[1] = (bij[0] + pre1*expm2) * r2inv;
+        bij[2] = (3.0*bij[1] + pre2*expm2) * r2inv;
+        bij[3] = (5.0*bij[2] + pre3*expm2) * r2inv;
+
+	compute_long(i,j,eij,bij,fmi,spi,spj);
+	compute_long_mech(i,j,eij,bij,fmi,spi,spj);
+      }
+
+      // force accumulation
+
+      f[i][0] += fi[0];	 
+      f[i][1] += fi[1];	  	  
+      f[i][2] += fi[2];
+      fm[i][0] += fmi[0];	 
+      fm[i][1] += fmi[1];	  	  
+      fm[i][2] += fmi[2];
+
+      if (newton_pair || j < nlocal) {
+	f[j][0] -= fi[0];	 
+        f[j][1] -= fi[1];	  	  
+        f[j][2] -= fi[2];
+      }
+
+      if (eflag) {
+	if (rsq <= local_cut2) {
+	  evdwl -= spi[0]*fmi[0] + spi[1]*fmi[1] + 
+	    spi[2]*fmi[2];
+	  evdwl *= hbar;
+	}
+      } else evdwl = 0.0;
+
+
+      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+	  evdwl,ecoul,fi[0],fi[1],fi[2],rij[0],rij[1],rij[2]);
+
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   update the pair interaction fmi acting on the spin ii
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::compute_single_pair(int ii, double fmi[3])
+{
+  //int i,j,jj,jnum,itype,jtype;  
+  int j,jj,jnum,itype,jtype,ntypes; 
+  int k,locflag;
+  int *ilist,*jlist,*numneigh,**firstneigh;  
+  double r,rinv,r2inv,rsq,grij,expm2,t,erfc;
+  double local_cut2,pre1,pre2,pre3;
+  double bij[4],xi[3],rij[3],eij[3],spi[4],spj[4];
+
+  int *type = atom->type;  
+  double **x = atom->x;
+  double **sp = atom->sp;	
+  double **fm_long = atom->fm_long;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+  
+  // check if interaction applies to type of ii
+
+  itype = type[ii];
+  ntypes = atom->ntypes;
+  locflag = 0;
+  k = 1;
+  while (k <= ntypes) {
+    if (k <= itype) {
+      if (setflag[k][itype] == 1) {
+        locflag =1;
+        break;
+      }
+      k++;
+    } else if (k > itype) {
+      if (setflag[itype][k] == 1) {
+        locflag =1;
+        break;
+      }
+      k++;
+    } else error->all(FLERR,"Wrong type number");
+  }
+
+  // if interaction applies to type ii,
+  // locflag = 1 and compute pair interaction
+
+  if (locflag == 1) {
+
+    pre1 = 2.0 * g_ewald / MY_PIS;
+    pre2 = 4.0 * pow(g_ewald,3.0) / MY_PIS;
+    pre3 = 8.0 * pow(g_ewald,5.0) / MY_PIS;
+
+    // computation of the exchange interaction
+    // loop over neighbors of atom i
+      
+    //i = ilist[ii];
+    xi[0] = x[ii][0];
+    xi[1] = x[ii][1];
+    xi[2] = x[ii][2];
+    spi[0] = sp[ii][0]; 
+    spi[1] = sp[ii][1]; 
+    spi[2] = sp[ii][2];
+    spi[3] = sp[ii][3];
+    jlist = firstneigh[ii];
+    jnum = numneigh[ii]; 
+    //itype = type[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+
+      spj[0] = sp[j][0]; 
+      spj[1] = sp[j][1]; 
+      spj[2] = sp[j][2]; 
+      spj[3] = sp[j][3]; 
+
+      fmi[0] = fmi[1] = fmi[2] = 0.0;
+      bij[0] = bij[1] = bij[2] = bij[3] = 0.0;
+     
+      rij[0] = x[j][0] - xi[0];
+      rij[1] = x[j][1] - xi[1];
+      rij[2] = x[j][2] - xi[2];
+      rsq = rij[0]*rij[0] + rij[1]*rij[1] + rij[2]*rij[2];
+      rinv = 1.0/sqrt(rsq);
+      eij[0] = rij[0]*rinv;
+      eij[1] = rij[1]*rinv;
+      eij[2] = rij[2]*rinv;
+
+      local_cut2 = cut_spin_long[itype][jtype]*cut_spin_long[itype][jtype];
+
+      if (rsq < local_cut2) {
+        r2inv = 1.0/rsq;
+
+        r = sqrt(rsq);
+        grij = g_ewald * r;
+        expm2 = exp(-grij*grij);
+        t = 1.0 / (1.0 + EWALD_P*grij);
+        erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+
+        bij[0] = erfc * rinv;
+        bij[1] = (bij[0] + pre1*expm2) * r2inv;
+        bij[2] = (3.0*bij[1] + pre2*expm2) * r2inv;
+        bij[3] = (5.0*bij[2] + pre3*expm2) * r2inv;
+
+        compute_long(ii,j,eij,bij,fmi,spi,spj);
+      }
+    }
+
+    // adding the kspace components to fm
+    
+    fmi[0] += fm_long[ii][0];
+    fmi[1] += fm_long[ii][1];
+    fmi[2] += fm_long[ii][2];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute dipolar interaction between spins i and j
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::compute_long(int i, int j, double eij[3], 
+    double bij[4], double fmi[3], double spi[4], double spj[4])
+{
+  double sjeij,pre;
+  double b1,b2,gigj;
+
+  gigj = spi[3] * spj[3];
+  pre = gigj*mub2mu0hbinv;
+  sjeij = spj[0]*eij[0] + spj[1]*eij[1] + spj[2]*eij[2];
+
+  b1 = bij[1];
+  b2 = bij[2];
+
+  fmi[0] += pre * (b2 * sjeij * eij[0] - b1 * spj[0]);
+  fmi[1] += pre * (b2 * sjeij * eij[1] - b1 * spj[1]);
+  fmi[2] += pre * (b2 * sjeij * eij[2] - b1 * spj[2]);
+}
+
+/* ----------------------------------------------------------------------
+   compute the mechanical force due to the dipolar interaction between 
+   atom i and atom j
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::compute_long_mech(int i, int j, double eij[3],
+    double bij[4], double fi[3], double spi[3], double spj[3])
+{
+  double sisj,sieij,sjeij,b2,b3;
+  double g1,g2,g1b2_g2b3,gigj,pre;
+
+  gigj = spi[3] * spj[3];
+  pre = gigj*mub2mu0;
+  sisj = spi[0]*spj[0] + spi[1]*spj[1] + spi[2]*spj[2];
+  sieij = spi[0]*eij[0] + spi[1]*eij[1] + spi[2]*eij[2];
+  sjeij = spj[0]*eij[0] + spj[1]*eij[1] + spj[2]*eij[2];
+
+  b2 = bij[2];
+  b3 = bij[3];
+  g1 = sisj;
+  g2 = -sieij*sjeij;
+  g1b2_g2b3 = g1*b2 + g2*b3;
+
+  fi[0] += pre * (eij[0] * g1b2_g2b3 + b2 * (sjeij*spi[0] + sieij*spj[0]));
+  fi[1] += pre * (eij[1] * g1b2_g2b3 + b2 * (sjeij*spi[1] + sieij*spj[1]));
+  fi[2] += pre * (eij[2] * g1b2_g2b3 + b2 * (sjeij*spi[2] + sieij*spj[2]));
+}
+
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::allocate()
+{
+  allocated = 1;
+  int n = atom->ntypes;
+
+  memory->create(setflag,n+1,n+1,"pair:setflag");
+  for (int i = 1; i <= n; i++)
+    for (int j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  memory->create(cut_spin_long,n+1,n+1,"pair/spin/long:cut_spin_long");
+  memory->create(cutsq,n+1,n+1,"pair/spin/long:cutsq");
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::write_restart(FILE *fp)
+{
+  write_restart_settings(fp);
+
+  int i,j;
+  for (i = 1; i <= atom->ntypes; i++) {
+    for (j = i; j <= atom->ntypes; j++) {
+      fwrite(&setflag[i][j],sizeof(int),1,fp);
+      if (setflag[i][j]) {
+	fwrite(&cut_spin_long[i][j],sizeof(int),1,fp);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::read_restart(FILE *fp)
+{
+  read_restart_settings(fp);
+
+  allocate();
+
+  int i,j;
+  int me = comm->me;
+  for (i = 1; i <= atom->ntypes; i++) {
+    for (j = i; j <= atom->ntypes; j++) {
+      if (me == 0) fread(&setflag[i][j],sizeof(int),1,fp);
+      MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
+      if (setflag[i][j]) {
+	if (me == 0) {
+	  fread(&cut_spin_long[i][j],sizeof(int),1,fp);
+	}
+	MPI_Bcast(&cut_spin_long[i][j],1,MPI_INT,0,world);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::write_restart_settings(FILE *fp)
+{
+  fwrite(&cut_spin_long_global,sizeof(double),1,fp);
+  fwrite(&mix_flag,sizeof(int),1,fp);
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::read_restart_settings(FILE *fp)
+{
+  if (comm->me == 0) {
+    fread(&cut_spin_long_global,sizeof(double),1,fp);
+    fread(&mix_flag,sizeof(int),1,fp);
+  }
+  MPI_Bcast(&cut_spin_long_global,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
+}
--- a/src/SPIN/pair_spin_dipole_long.h
+++ b/src/SPIN/pair_spin_dipole_long.h
@ -0,0 +1,100 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   www.cs.sandia.gov/~sjplimp/lammps.html
+   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(spin/dipole/long,PairSpinDipoleLong)
+
+#else
+
+#ifndef LMP_PAIR_SPIN_DIPOLE_LONG_H
+#define LMP_PAIR_SPIN_DIPOLE_LONG_H
+
+#include "pair_spin.h"
+
+namespace LAMMPS_NS {
+
+class PairSpinDipoleLong : public PairSpin {
+ public:
+  double cut_coul;
+  double **sigma;
+
+  PairSpinDipoleLong(class LAMMPS *);
+  ~PairSpinDipoleLong();
+  void settings(int, char **);
+  void coeff(int, char **);
+  double init_one(int, int);
+  void init_style();
+  void *extract(const char *, int &); 
+  
+  void compute(int, int);
+  void compute_single_pair(int, double *);
+
+  void compute_long(int, int, double *, double *, double *, 
+      double *, double *);
+  void compute_long_mech(int, int, double *, double *, double *, 
+      double *, double *);
+
+  void write_restart(FILE *);
+  void read_restart(FILE *);
+  void write_restart_settings(FILE *);
+  void read_restart_settings(FILE *);
+  
+  double cut_spin_long_global;	// global long cutoff distance 
+
+ protected:
+  double hbar;	 		// reduced Planck's constant
+  double mub;			// Bohr's magneton
+  double mu_0;			// vacuum permeability
+  double mub2mu0;		// prefactor for mech force
+  double mub2mu0hbinv;		// prefactor for mag force
+
+  double **cut_spin_long;	// cutoff distance long
+
+  double g_ewald;
+  int ewald_order;
+
+  int lattice_flag;			// flag for mech force computation
+  class FixNVESpin *lockfixnvespin;	// ptr for setups
+
+  void allocate();
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Incorrect args in pair_style command
+
+Self-explanatory.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair dipole/long requires atom attributes q, mu, torque
+
+The atom style defined does not have these attributes.
+
+E: Can only use 'metal' units with spins
+
+This feature is not yet supported.
+
+E: Pair style requires a KSpace style
+
+No kspace style is defined.
+
+*/
--- a/src/SPIN/pair_spin_exchange.cpp
+++ b/src/SPIN/pair_spin_exchange.cpp
@ -75,7 +75,7 @@ PairSpinExchange::~PairSpinExchange()

 void PairSpinExchange::settings(int narg, char **arg)
 {
-  if (narg < 1 || narg > 2)
+  if (narg < 1 || narg > 7) 
    error->all(FLERR,"Incorrect number of args in pair_style pair/spin command");

  if (strcmp(update->unit_style,"metal") != 0)
@ -464,8 +464,6 @@ void PairSpinExchange::allocate()
  memory->create(cutsq,n+1,n+1,"pair:cutsq");
 }

-
-
 /* ----------------------------------------------------------------------
   proc 0 writes to restart file
 ------------------------------------------------------------------------- */
@ -550,5 +548,3 @@ void PairSpinExchange::read_restart_settings(FILE *fp)
  MPI_Bcast(&offset_flag,1,MPI_INT,0,world);
  MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
 }
-
-
--- a/src/USER-LB/fix_lb_fluid.cpp
+++ b/src/USER-LB/fix_lb_fluid.cpp
@ -547,6 +547,18 @@ FixLbFluid::~FixLbFluid()
  } else {
    delete [] NodeArea;
  }
+  MPI_Type_free(&passxf);
+  MPI_Type_free(&passyf);
+  MPI_Type_free(&passzf);
+  MPI_Type_free(&passxu);
+  MPI_Type_free(&passyu);
+  MPI_Type_free(&passzu);
+  MPI_Type_free(&passxrho);
+  MPI_Type_free(&passyrho);
+  MPI_Type_free(&passzrho);
+  MPI_Type_free(&passxtemp);
+  MPI_Type_free(&passytemp);
+  MPI_Type_free(&passztemp);
 }

 int FixLbFluid::setmask()
--- a/src/USER-MISC/fix_bond_react.cpp
+++ b/src/USER-MISC/fix_bond_react.cpp
@ -157,6 +157,7 @@ FixBondReact::FixBondReact(LAMMPS *lmp, int narg, char **arg) :

  // this looks excessive
  // the price of vectorization (all reactions in one command)?
+  memory->create(rxn_name,nreacts,MAXLINE,"bond/react:rxn_name");
  memory->create(nevery,nreacts,"bond/react:nevery");
  memory->create(cutsq,nreacts,2,"bond/react:cutsq");
  memory->create(unreacted_mol,nreacts,"bond/react:unreacted_mol");
@ -207,8 +208,7 @@ FixBondReact::FixBondReact(LAMMPS *lmp, int narg, char **arg) :

    iarg++;

-    iarg++; // read in reaction name here
-    //for example, rxn_name[rxn] = ...
+    rxn_name[rxn] = arg[iarg++];

    int igroup = group->find(arg[iarg++]);
    if (igroup == -1) error->all(FLERR,"Could not find fix group ID");
@ -1720,8 +1720,11 @@ void FixBondReact::find_landlocked_atoms(int myrxn)
  // bad molecule templates check
  // if atoms change types, but aren't landlocked, that's bad
  for (int i = 0; i < twomol->natoms; i++) {
-    if (twomol->type[i] != onemol->type[equivalences[i][1][myrxn]-1] && landlocked_atoms[i][myrxn] == 0)
-      error->one(FLERR,"Bond/react: Atom affected by reaction too close to template edge");
+    if (twomol->type[i] != onemol->type[equivalences[i][1][myrxn]-1] && landlocked_atoms[i][myrxn] == 0) {
+      char str[128];
+      snprintf(str,128,"Bond/react: Atom affected by reaction %s too close to template edge",rxn_name[myrxn]);
+      error->all(FLERR,str);
+    }
  }

  // additionally, if a bond changes type, but neither involved atom is landlocked, bad
@ -1737,7 +1740,9 @@ void FixBondReact::find_landlocked_atoms(int myrxn)
            onemol_batom = onemol->bond_atom[onemol_atomi-1][m];
            if (onemol_batom == equivalences[twomol_atomj-1][1][myrxn]) {
              if (twomol->bond_type[i][j] != onemol->bond_type[onemol_atomi-1][m]) {
-                error->one(FLERR,"Bond/react: Bond type affected by reaction too close to template edge");
+                char str[128];
+                snprintf(str,128,"Bond/react: Atom affected by reaction %s too close to template edge",rxn_name[myrxn]);
+                error->all(FLERR,str);
              }
            }
          }
@ -1747,7 +1752,9 @@ void FixBondReact::find_landlocked_atoms(int myrxn)
              onemol_batom = onemol->bond_atom[onemol_atomj-1][m];
              if (onemol_batom == equivalences[i][1][myrxn]) {
                if (twomol->bond_type[i][j] != onemol->bond_type[onemol_atomj-1][m]) {
-                  error->one(FLERR,"Bond/react: Bond type affected by reaction too close to template edge");
+                  char str[128];
+                  snprintf(str,128,"Bond/react: Atom affected by reaction %s too close to template edge",rxn_name[myrxn]);
+                  error->all(FLERR,str);
                }
              }
            }
@ -1763,7 +1770,7 @@ void FixBondReact::find_landlocked_atoms(int myrxn)
      int ii = reverse_equiv[i][1][myrxn] - 1;
      for (int j = 0; j < twomol_nxspecial[ii][0]; j++) {
        if (delete_atoms[equivalences[twomol_xspecial[ii][j]-1][1][myrxn]-1][myrxn] == 0) {
-         error->one(FLERR,"Bond/react: A deleted atom cannot remain bonded to an atom that is not deleted");
+         error->all(FLERR,"Bond/react: A deleted atom cannot remain bonded to an atom that is not deleted");
        }
      }
    }
@ -1774,7 +1781,7 @@ void FixBondReact::find_landlocked_atoms(int myrxn)
    for (int i = 0; i < twomol->natoms; i++) {
      if (twomol_nxspecial[i][0] != onemol_nxspecial[equivalences[i][1][myrxn]-1][0] && landlocked_atoms[i][myrxn] == 0) {
        char str[128];
-        sprintf(str,"Bond/react: An atom in 'react #%d' changes bond connectivity but not atom type",myrxn+1);
+        snprintf(str,128,"Bond/react: Atom affected by reaction %s too close to template edge",rxn_name[myrxn]);
        error->warning(FLERR,str);
        break;
      }
--- a/src/USER-MISC/fix_bond_react.h
+++ b/src/USER-MISC/fix_bond_react.h
@ -67,6 +67,7 @@ class FixBondReact : public Fix {
  int *groupbits;

  int rxnID; // integer ID for identifying current bond/react
+  char **rxn_name; // name of reaction
  int *reaction_count;
  int *reaction_count_total;
  int nmax; // max num local atoms
@ -213,14 +214,14 @@ E: Bond/react: Unknown section in map file

 Please ensure reaction map files are properly formatted.

-E: Bond/react: Atom affected by reaction too close to template edge
+E or W: Bond/react: Atom affected by reaction %s too close to template edge

-This means an atom which changes type during the reaction is too close
-to an 'edge' atom defined in the superimpose file. This could cause
-incorrect assignment of bonds, angle, etc. Generally, this means you
-must include more atoms in your templates, such that there are at
-least two atoms between each atom involved in the reaction and an edge
-atom.
+This means an atom which changes type or connectivity during the
+reaction is too close to an 'edge' atom defined in the superimpose
+file. This could cause incorrect assignment of bonds, angle, etc.
+Generally, this means you must include more atoms in your templates,
+such that there are at least two atoms between each atom involved in
+the reaction and an edge atom.

 E: Bond/react: Fix bond/react needs ghost atoms from farther away

@ -233,11 +234,6 @@ E: Bond/react: A deleted atom cannot remain bonded to an atom that is not delete

 Self-explanatory.

-W: Bond/react: An atom in 'react #%d' changes bond connectivity but not atom type
-
-You may want to double-check that all atom types are properly assigned
-in the post-reaction template.
-
 E: Bond/react special bond generation overflow

 The number of special bonds per-atom created by a reaction exceeds the
--- a/src/USER-PLUMED/fix_plumed.cpp
+++ b/src/USER-PLUMED/fix_plumed.cpp
@ -34,6 +34,7 @@
 #include "modify.h"
 #include "pair.h"
 #include "utils.h"
+#include "timer.h"

 #include "plumed/wrapper/Plumed.h"

@ -406,6 +407,8 @@ void FixPlumed::post_force(int /* vflag */)

  // pass all pointers to plumed:
  p->cmd("setStep",&step);
+  int plumedStopCondition=0; 
+  p->cmd("setStopFlag",&plumedStopCondition);
  p->cmd("setPositions",&atom->x[0][0]);
  p->cmd("setBox",&box[0][0]);
  p->cmd("setForces",&atom->f[0][0]);
@ -478,6 +481,8 @@ void FixPlumed::post_force(int /* vflag */)
  // do the real calculation:
  p->cmd("performCalc");

+  if(plumedStopCondition) timer->force_timeout();
+
  // retransform virial to lammps representation and assign it to this
  // fix's virial. If the energy is biased, Plumed is giving back the full
  // virial and therefore we have to subtract the initial virial i.e. virial_lmp.
--- a/Show More
+++ b/Show More