diff --git a/doc/src/Eqs/pair_spin_dipole.jpg b/doc/src/Eqs/pair_spin_dipole.jpg
new file mode 100644
index 0000000000..e648547ec3
Binary files /dev/null and b/doc/src/Eqs/pair_spin_dipole.jpg differ
diff --git a/doc/src/Eqs/pair_spin_dipole.tex b/doc/src/Eqs/pair_spin_dipole.tex
new file mode 100644
index 0000000000..27f0bc4d2d
--- /dev/null
+++ b/doc/src/Eqs/pair_spin_dipole.tex
@@ -0,0 +1,42 @@
+\documentclass[preview]{standalone}
+\usepackage{varwidth}
+\usepackage[utf8x]{inputenc}
+\usepackage{amsmath,amssymb,graphics,bm,setspace}
+
+\begin{document}
+\begin{varwidth}{50in}
+  \begin{equation}
+    \mathcal{H}_{\rm long}= 
+    -\frac{\mu_{0} \left( \mu_B\right)^2}{4\pi} 
+    \sum_{i,j,i\neq j}^{N}
+    \frac{g_i g_j}{r_{ij}^3}
+    \Big(3 
+    \left(\bm{e}_{ij}\cdot \bm{s}_{i}\right) 
+    \left(\bm{e}_{ij}\cdot \bm{s}_{j}\right) 
+    -\bm{s}_i\cdot\bm{s}_j \Big)
+    \nonumber
+  \end{equation}
+  \begin{equation}
+    \bm{\omega}_i = 
+    \frac{\mu_0 (\mu_B)^2}{4\pi\hbar}\sum_{j}
+    \frac{g_i g_j}{r_{ij}^3}
+    \, \Big(
+    3\,(\bm{e}_{ij}\cdot\bm{s}_{j})\bm{e}_{ij}
+    -\bm{s}_{j} \Big) \nonumber
+  \end{equation}
+  \begin{equation}
+    \bm{F}_i =
+    \frac{3\, \mu_0 (\mu_B)^2}{4\pi} \sum_j
+    \frac{g_i g_j}{r_{ij}^4}
+    \Big[\big( (\bm{s}_i\cdot\bm{s}_j) 
+    -5(\bm{e}_{ij}\cdot\bm{s}_i)
+    (\bm{e}_{ij}\cdot\bm{s}_j)\big) \bm{e}_{ij}+
+    \big(
+    (\bm{e}_{ij}\cdot\bm{s}_i)\bm{s}_j+
+    (\bm{e}_{ij}\cdot\bm{s}_j)\bm{s}_i
+    \big)
+    \Big]
+    \nonumber
+  \end{equation}
+\end{varwidth}
+\end{document}
diff --git a/doc/src/Eqs/pair_spin_long_range.jpg b/doc/src/Eqs/pair_spin_long_range.jpg
new file mode 100644
index 0000000000..bc133d10cf
Binary files /dev/null and b/doc/src/Eqs/pair_spin_long_range.jpg differ
diff --git a/doc/src/Eqs/pair_spin_long_range.tex b/doc/src/Eqs/pair_spin_long_range.tex
new file mode 100644
index 0000000000..493879e4dd
--- /dev/null
+++ b/doc/src/Eqs/pair_spin_long_range.tex
@@ -0,0 +1,20 @@
+\documentclass[preview]{standalone}
+\usepackage{varwidth}
+\usepackage[utf8x]{inputenc}
+\usepackage{amsmath,amssymb,graphics,bm,setspace}
+
+\begin{document}
+\begin{varwidth}{50in}
+  \begin{equation}
+    \mathcal{H}_{\rm long}= 
+    -\frac{\mu_{0} \left( \mu_B\right)^2}{4\pi} 
+    \sum_{i,j,i\neq j}^{N}
+    \frac{g_i g_j}{r_{ij}^3}
+    \Big(3 
+    \left(\bm{e}_{ij}\cdot \bm{s}_{i}\right) 
+    \left(\bm{e}_{ij}\cdot \bm{s}_{j}\right) 
+    -\bm{s}_i\cdot\bm{s}_j \Big)
+    \nonumber
+  \end{equation}
+\end{varwidth}
+\end{document}
diff --git a/doc/src/Eqs/pair_spin_long_range_force.jpg b/doc/src/Eqs/pair_spin_long_range_force.jpg
new file mode 100644
index 0000000000..71d2acbe5a
Binary files /dev/null and b/doc/src/Eqs/pair_spin_long_range_force.jpg differ
diff --git a/doc/src/Eqs/pair_spin_long_range_force.tex b/doc/src/Eqs/pair_spin_long_range_force.tex
new file mode 100644
index 0000000000..62f8ce1374
--- /dev/null
+++ b/doc/src/Eqs/pair_spin_long_range_force.tex
@@ -0,0 +1,23 @@
+\documentclass[preview]{standalone}
+\usepackage{varwidth}
+\usepackage[utf8x]{inputenc}
+\usepackage{amsmath,amssymb,graphics,bm,setspace}
+
+\begin{document}
+\begin{varwidth}{50in}
+  \begin{equation}
+    \bm{F}_i =
+    \frac{\mu_0 (\mu_B)^2}{4\pi} \sum_j
+    \frac{g_i g_j}{r_{ij}^4}
+    \Big[\big( (\bm{s}_i\cdot\bm{s}_j) 
+    -5(\bm{e}_{ij}\cdot\bm{s}_i)
+    (\bm{e}_{ij}\cdot\bm{s}_j)\big) \bm{e}_{ij}+
+    \big(
+    (\bm{e}_{ij}\cdot\bm{s}_i)\bm{s}_j+
+    (\bm{e}_{ij}\cdot\bm{s}_j)\bm{s}_i
+    \big)
+    \Big]
+    \nonumber
+  \end{equation}
+\end{varwidth}
+\end{document}
diff --git a/doc/src/Eqs/pair_spin_long_range_magforce.jpg b/doc/src/Eqs/pair_spin_long_range_magforce.jpg
new file mode 100644
index 0000000000..d6a83a0c27
Binary files /dev/null and b/doc/src/Eqs/pair_spin_long_range_magforce.jpg differ
diff --git a/doc/src/Eqs/pair_spin_long_range_magforce.tex b/doc/src/Eqs/pair_spin_long_range_magforce.tex
new file mode 100644
index 0000000000..ad40cf9d8b
--- /dev/null
+++ b/doc/src/Eqs/pair_spin_long_range_magforce.tex
@@ -0,0 +1,17 @@
+\documentclass[preview]{standalone}
+\usepackage{varwidth}
+\usepackage[utf8x]{inputenc}
+\usepackage{amsmath,amssymb,graphics,bm,setspace}
+
+\begin{document}
+\begin{varwidth}{50in}
+  \begin{equation}
+    \bm{\omega}_i = 
+    \frac{\mu_0 (\mu_B)^2}{4\pi\hbar}\sum_{j}
+    \frac{g_i g_j}{r_{ij}^3}
+    \, \Big(
+    3\,(\bm{e}_{ij}\cdot\bm{s}_{j})\bm{e}_{ij}
+    -\bm{s}_{j} \Big) \nonumber
+  \end{equation}
+\end{varwidth}
+\end{document}
diff --git a/doc/src/kspace_modify.txt b/doc/src/kspace_modify.txt
index 65b2174334..c5a2ce1b69 100644
--- a/doc/src/kspace_modify.txt
+++ b/doc/src/kspace_modify.txt
@@ -392,7 +392,8 @@ boundaries can be set using "boundary"_boundary.html (the slab
 approximation in not needed).  The {slab} keyword is not currently
 supported by Ewald or PPPM when using a triclinic simulation cell. The
 slab correction has also been extended to point dipole interactions
-"(Klapp)"_#Klapp in "kspace_style"_kspace_style.html {ewald/disp}.
+"(Klapp)"_#Klapp in "kspace_style"_kspace_style.html {ewald/disp},
+{ewald/dipole}, and {pppm/dipole}.
 
 NOTE: If you wish to apply an electric field in the Z-direction, in
 conjunction with the {slab} keyword, you should do it by adding
diff --git a/doc/src/kspace_style.txt b/doc/src/kspace_style.txt
index e1f799a6c9..98ec1e64e6 100644
--- a/doc/src/kspace_style.txt
+++ b/doc/src/kspace_style.txt
@@ -20,6 +20,10 @@ style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg}
     accuracy = desired relative error in forces
   {ewald/omp} value = accuracy
     accuracy = desired relative error in forces
+  {ewald/dipole} value = accuracy
+    accuracy = desired relative error in forces
+  {ewald/dipole/spin} value = accuracy
+    accuracy = desired relative error in forces
   {pppm} value = accuracy
     accuracy = desired relative error in forces
   {pppm/cg} values = accuracy (smallq)
@@ -47,6 +51,10 @@ style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg}
     accuracy = desired relative error in forces
   {pppm/stagger} value = accuracy
     accuracy = desired relative error in forces
+  {pppm/dipole} value = accuracy
+    accuracy = desired relative error in forces
+  {pppm/dipole/spin} value = accuracy
+    accuracy = desired relative error in forces
   {msm} value = accuracy
     accuracy = desired relative error in forces
   {msm/cg} value = accuracy (smallq)
@@ -105,9 +113,15 @@ The {ewald/disp} style adds a long-range dispersion sum option for
 but in a more efficient manner than the {ewald} style.  The 1/r^6
 capability means that Lennard-Jones or Buckingham potentials can be
 used without a cutoff, i.e. they become full long-range potentials.
-The {ewald/disp} style can also be used with point-dipoles
-"(Toukmaji)"_#Toukmaji and is currently the only kspace solver in
-LAMMPS with this capability.
+The {ewald/disp} style can also be used with point-dipoles, see
+"(Toukmaji)"_#Toukmaji.
+
+The {ewald/dipole} style adds long-range standard Ewald summations 
+for dipole-dipole interactions, see "(Toukmaji)"_#Toukmaji.
+
+The {ewald/dipole/spin} style adds long-range standard Ewald 
+summations for magnetic dipole-dipole interactions between
+magnetic spins.
 
 :line
 
@@ -128,6 +142,12 @@ The optional {smallq} argument defines the cutoff for the absolute
 charge value which determines whether a particle is considered charged
 or not.  Its default value is 1.0e-5.
 
+The {pppm/dipole} style invokes a particle-particle particle-mesh solver 
+for dipole-dipole interactions, following the method of "(Cerda)"_#Cerda2008.
+
+The {pppm/dipole/spin} style invokes a particle-particle particle-mesh solver 
+for magnetic dipole-dipole interactions between magnetic spins. 
+
 The {pppm/tip4p} style is identical to the {pppm} style except that it
 adds a charge at the massless 4th site in each TIP4P water molecule.
 It should be used with "pair styles"_pair_style.html with a
@@ -317,7 +337,10 @@ using ideas from chapter 3 of "(Hardy)"_#Hardy2006, with equation 3.197
 of particular note. When using {msm} with non-periodic boundary
 conditions, it is expected that the error estimation will be too
 pessimistic. RMS force errors for dipoles when using {ewald/disp}
-are estimated using equations 33 and 46 of "(Wang)"_#Wang.
+or {ewald/dipole} are estimated using equations 33 and 46 of
+"(Wang)"_#Wang. The RMS force errors for {pppm/dipole} are estimated
+using the equations in "(Cerda)"_#Cerda2008.
+
 
 See the "kspace_modify"_kspace_modify.html command for additional
 options of the K-space solvers that can be set, including a {force}
@@ -464,6 +487,9 @@ Illinois at Urbana-Champaign, (2006).
 :link(Sutmann2013)
 [(Sutmann)] Sutmann, Arnold, Fahrenberger, et. al., Physical review / E 88(6), 063308 (2013)
 
+:link(Cerda2008)
+[(Cerda)] Cerda, Ballenegger, Lenz, Holm, J Chem Phys 129, 234104 (2008)
+
 :link(Who2012)
 [(Who)] Who, Author2, Author3, J of Long Range Solvers, 35, 164-177
 (2012).
diff --git a/doc/src/pair_spin_dipole.txt b/doc/src/pair_spin_dipole.txt
new file mode 100644
index 0000000000..2f27f91d08
--- /dev/null
+++ b/doc/src/pair_spin_dipole.txt
@@ -0,0 +1,89 @@
+"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Commands_all.html)
+
+:line
+
+pair_style spin/dipole/cut command :h3
+pair_style spin/dipole/long command :h3
+
+[Syntax:]
+
+pair_style spin/dipole/cut cutoff 
+pair_style spin/dipole/long cutoff :pre
+
+cutoff = global cutoff for magnetic dipole energy and forces
+(optional) (distance units) :ulb,l
+:ule
+
+[Examples:]
+
+pair_style spin/dipole/cut 10.0
+pair_coeff * * 10.0 
+pair_coeff 2 3 8.0 :pre
+
+pair_style spin/dipole/long 9.0
+pair_coeff * * 1.0 1.0
+pair_coeff 2 3 1.0 1.0 2.5 4.0 scale 0.5
+pair_coeff 2 3 1.0 1.0 2.5 4.0 :pre
+
+[Description:]
+
+Style {spin/dipole/cut} computes a short-range dipole-dipole
+interaction between pairs of magnetic particles that each 
+have a magnetic spin. 
+The magnetic dipole-dipole interactions are computed by the
+following formulas for the magnetic energy, magnetic precession 
+vector omega and mechanical force between particles I and J.
+
+:c,image(Eqs/pair_spin_dipole.jpg)
+
+where si and sj are the spin on two magnetic particles, 
+r is their separation distance, and the vector e = (Ri - Rj)/|Ri - Rj| 
+is the direction vector between the two particles.  
+
+Style {spin/dipole/long} computes long-range magnetic dipole-dipole
+interaction.
+A "kspace_style"_kspace_style.html must be defined to
+use this pair style.  Currently, "kspace_style 
+ewald/dipole/spin"_kspace_style.html and "kspace_style
+pppm/dipole/spin"_kspace_style.html support long-range magnetic 
+dipole-dipole interactions.
+
+:line
+
+The "pair_modify"_pair_modify.html table option is not relevant
+for this pair style.
+
+This pair style does not support the "pair_modify"_pair_modify.html
+tail option for adding long-range tail corrections to energy and
+pressure.
+
+This pair style writes its information to "binary restart
+files"_restart.html, so pair_style and pair_coeff commands do not need
+to be specified in an input script that reads a restart file.
+
+[Restrictions:]
+
+The {spin/dipole/cut} and {spin/dipole/long} styles are part of
+the SPIN package.  They are only enabled if LAMMPS was built with that 
+package.  See the "Build package"_Build_package.html doc page for more 
+info.
+
+Using dipole/spin pair styles with {electron} "units"_units.html is not
+currently supported.
+
+[Related commands:]
+
+"pair_coeff"_pair_coeff.html, "kspace_style"_kspace_style.html
+"fix nve/spin"_fix_nve_spin.html
+
+[Default:] none
+
+:line
+
+:link(Allen2)
+[(Allen)] Allen and Tildesley, Computer Simulation of Liquids,
+Clarendon Press, Oxford, 1987.
diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt
index 7b7c4d11b2..a023c5b821 100644
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@@ -1405,6 +1405,7 @@ Lenart
 lennard
 Lennard
 Lenosky
+Lenz
 Lett
 Leuven
 Leven
diff --git a/examples/SPIN/cobalt_hcp/in.spin.cobalt_hcp b/examples/SPIN/cobalt_hcp/in.spin.cobalt_hcp
index 3ff0b1cadf..3f34838553 100644
--- a/examples/SPIN/cobalt_hcp/in.spin.cobalt_hcp
+++ b/examples/SPIN/cobalt_hcp/in.spin.cobalt_hcp
@@ -25,16 +25,18 @@ velocity 	all create 100 4928459 rot yes dist gaussian
 
 #pair_style 	hybrid/overlay eam/alloy spin/exchange 4.0 spin/neel 4.0
 pair_style 	hybrid/overlay eam/alloy spin/exchange 4.0
-pair_coeff 	* * eam/alloy Co_PurjaPun_2012.eam.alloy Co
-pair_coeff 	* * spin/exchange exchange 4.0 0.3593 1.135028015e-05 1.064568567
+pair_coeff 	* * eam/alloy ../examples/SPIN/cobalt_hcp/Co_PurjaPun_2012.eam.alloy Co
+#pair_coeff 	* * eam/alloy Co_PurjaPun_2012.eam.alloy Co
+pair_coeff 	* * spin/exchange exchange 4.0 -0.3593 1.135028015e-05 1.064568567
 #pair_coeff 	* * spin/neel neel 4.0 0.0048 0.234 1.168 2.6905 0.705 0.652  
 
 neighbor 	0.1 bin
 neigh_modify 	every 10 check yes delay 20
 
 #fix 		1 all precession/spin zeeman 1.0 0.0 0.0 1.0
-fix 		1 all precession/spin zeeman 0.0 0.0 0.0 1.0
-fix 		2 all langevin/spin 0.0 0.0 21
+fix 		1 all precession/spin anisotropy 0.01 0.0 0.0 1.0
+#fix 		2 all langevin/spin 0.0 0.0 21
+fix 		2 all langevin/spin 0.0 0.1 21
 fix 		3 all nve/spin lattice yes
 
 timestep	0.0001
diff --git a/examples/SPIN/dipole_spin/Fe_Mishin2006.eam.alloy b/examples/SPIN/dipole_spin/Fe_Mishin2006.eam.alloy
new file mode 120000
index 0000000000..a93b43f72a
--- /dev/null
+++ b/examples/SPIN/dipole_spin/Fe_Mishin2006.eam.alloy
@@ -0,0 +1 @@
+../iron/Fe_Mishin2006.eam.alloy
\ No newline at end of file
diff --git a/examples/SPIN/dipole_spin/exchange_fit_bcc_iron/exchange_bcc_iron.dat b/examples/SPIN/dipole_spin/exchange_fit_bcc_iron/exchange_bcc_iron.dat
new file mode 100644
index 0000000000..58134f2444
--- /dev/null
+++ b/examples/SPIN/dipole_spin/exchange_fit_bcc_iron/exchange_bcc_iron.dat
@@ -0,0 +1,5 @@
+2.4824 0.01948336
+2.8665 0.01109
+4.0538 -0.0002176
+4.753 -0.001714
+4.965 -0.001986
diff --git a/examples/SPIN/dipole_spin/exchange_fit_bcc_iron/exchange_fit.py b/examples/SPIN/dipole_spin/exchange_fit_bcc_iron/exchange_fit.py
new file mode 100644
index 0000000000..ccf84fc233
--- /dev/null
+++ b/examples/SPIN/dipole_spin/exchange_fit_bcc_iron/exchange_fit.py
@@ -0,0 +1,32 @@
+#Program fitting the exchange interaction
+#Model curve: Bethe-Slater function
+import numpy as np, pylab, tkinter
+import matplotlib.pyplot as plt
+from scipy.optimize import curve_fit
+from decimal import *
+
+print("Loop begin")
+
+#Definition of the Bethe-Slater function
+def func(x,a,b,c):
+    return 4*a*((x/c)**2)*(1-b*(x/c)**2)*np.exp(-(x/c)**2)
+
+#Exchange coeff table (data to fit)
+rdata, Jdata = np.loadtxt('exchange_bcc_iron.dat', usecols=(0,1), unpack=True)
+plt.plot(rdata, Jdata, 'b-', label='data')
+
+#Perform the fit
+popt, pcov = curve_fit(func, rdata, Jdata, bounds=(0, [500.,5.,5.]))
+plt.plot(rdata, func(rdata, *popt), 'r--', label='fit')
+
+#Print the fitted params
+print("Parameters: a={:.10} (in meV), b={:.10} (adim), c={:.10} (in Ang)".format(*popt))
+
+#Ploting the result
+plt.xlabel('r_ij')
+pylab.xlim([0,6.5])
+plt.ylabel('J_ij')
+plt.legend()
+plt.show()
+
+print("Loop end")
diff --git a/examples/SPIN/dipole_spin/fe_dd.dat b/examples/SPIN/dipole_spin/fe_dd.dat
new file mode 100644
index 0000000000..a4b4c9a0d7
--- /dev/null
+++ b/examples/SPIN/dipole_spin/fe_dd.dat
@@ -0,0 +1,19 @@
+            6            8
+ Optimal parameter set
+            1    4.100199340884814       F
+            2    1.565647547483517       F
+            1   0.9332056681088162       T    3.000000000000000     
+            2   -1.162558782567700       T    2.866666666666670     
+            3  -0.3502026949249225       T    2.733333333333330     
+            4   0.4287820835430028       T    2.600000000000000     
+            5    4.907925057809273       T    2.400000000000000     
+            6   -5.307049068415304       T    2.300000000000000     
+            1  -0.1960674387419232       F    4.100000000000000     
+            2   0.3687525935422963       F    3.800000000000000     
+            3   -1.505333614924853       F    3.500000000000000     
+            4    4.948907078156191       T    3.200000000000000     
+            5   -4.894613262753399       T    2.900000000000000     
+            6    3.468897724782442       T    2.600000000000000     
+            7   -1.792218099820337       T    2.400000000000000     
+            8    80.22069592246987       T    2.300000000000000     
+
diff --git a/examples/SPIN/dipole_spin/in.spin.iron_dipole_cut b/examples/SPIN/dipole_spin/in.spin.iron_dipole_cut
new file mode 100644
index 0000000000..a409fe0563
--- /dev/null
+++ b/examples/SPIN/dipole_spin/in.spin.iron_dipole_cut
@@ -0,0 +1,59 @@
+# bcc iron in a 3d periodic box
+
+clear 
+units 		metal
+atom_style 	spin
+
+dimension 	3
+boundary 	p p p
+
+# necessary for the serial algorithm (sametag)
+atom_modify 	map array 
+
+lattice 	bcc 2.8665
+region 		box block 0.0 5.0 0.0 5.0 0.0 5.0
+create_box 	1 box
+create_atoms 	1 box
+
+# setting mass, mag. moments, and interactions for bcc iron
+
+mass		1 55.845
+set 		group all spin 2.2 -1.0 0.0 0.0
+velocity 	all create 100 4928459 rot yes dist gaussian
+
+pair_style 	hybrid/overlay eam/alloy spin/exchange 3.5 spin/dipole/cut 8.0
+pair_coeff 	* * eam/alloy Fe_Mishin2006.eam.alloy Fe
+pair_coeff 	* * spin/exchange exchange 3.4 0.02726 0.2171 1.841
+pair_coeff      * * spin/dipole/cut 8.0
+
+neighbor 	0.1 bin
+neigh_modify 	every 10 check yes delay 20
+
+fix 		1 all precession/spin cubic 0.001 0.0005 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
+fix_modify 	1 energy yes
+fix 		2 all langevin/spin 0.0 0.0 21
+
+fix 		3 all nve/spin lattice yes
+timestep	0.0001
+
+# compute and output options
+
+compute 	out_mag    all spin
+compute 	out_pe     all pe
+compute 	out_ke     all ke
+compute 	out_temp   all temp
+
+variable 	magx      equal c_out_mag[1]
+variable 	magy      equal c_out_mag[2]
+variable 	magz      equal c_out_mag[3]
+variable 	magnorm   equal c_out_mag[4]
+variable 	emag      equal c_out_mag[5]
+variable 	tmag      equal c_out_mag[6]
+
+thermo_style    custom step time v_magx v_magy v_magz v_magnorm v_tmag v_emag pe etotal
+thermo          50
+
+compute 	outsp all property/atom spx spy spz sp fmx fmy fmz
+dump 		1 all custom 100 dump_iron.lammpstrj type x y z c_outsp[1] c_outsp[2] c_outsp[3]
+
+run 		2000
diff --git a/examples/SPIN/dipole_spin/in.spin.iron_dipole_ewald b/examples/SPIN/dipole_spin/in.spin.iron_dipole_ewald
new file mode 100644
index 0000000000..58b44b55fe
--- /dev/null
+++ b/examples/SPIN/dipole_spin/in.spin.iron_dipole_ewald
@@ -0,0 +1,61 @@
+# bcc iron in a 3d periodic box
+
+clear 
+units 		metal
+atom_style 	spin
+
+dimension 	3
+boundary 	p p p
+
+# necessary for the serial algorithm (sametag)
+atom_modify 	map array 
+
+lattice 	bcc 2.8665
+region 		box block 0.0 5.0 0.0 5.0 0.0 5.0
+create_box 	1 box
+create_atoms 	1 box
+
+# setting mass, mag. moments, and interactions for bcc iron
+
+mass		1 55.845
+set 		group all spin 2.2 -1.0 0.0 0.0
+velocity 	all create 100 4928459 rot yes dist gaussian
+
+pair_style 	hybrid/overlay eam/alloy spin/exchange 3.5 spin/dipole/long 8.0
+pair_coeff 	* * eam/alloy Fe_Mishin2006.eam.alloy Fe
+pair_coeff 	* * spin/exchange exchange 3.4 0.02726 0.2171 1.841
+pair_coeff	* * spin/dipole/long 8.0
+
+neighbor 	0.1 bin
+neigh_modify 	every 10 check yes delay 20
+
+kspace_style 	ewald/dipole/spin 1.0e-4
+
+fix 		1 all precession/spin cubic 0.001 0.0005 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
+fix_modify 	1 energy yes
+fix 		2 all langevin/spin 0.0 0.0 21
+
+fix 		3 all nve/spin lattice yes
+timestep	0.0001
+
+# compute and output options
+
+compute 	out_mag    all spin
+compute 	out_pe     all pe
+compute 	out_ke     all ke
+compute 	out_temp   all temp
+
+variable 	magx      equal c_out_mag[1]
+variable 	magy      equal c_out_mag[2]
+variable 	magz      equal c_out_mag[3]
+variable 	magnorm   equal c_out_mag[4]
+variable 	emag      equal c_out_mag[5]
+variable 	tmag      equal c_out_mag[6]
+
+thermo_style    custom step time v_magx v_magy v_magz v_magnorm v_tmag v_emag pe etotal
+thermo          50
+
+compute 	outsp all property/atom spx spy spz sp fmx fmy fmz
+dump 		1 all custom 100 dump_iron.lammpstrj type x y z c_outsp[1] c_outsp[2] c_outsp[3]
+
+run 		2000
diff --git a/examples/SPIN/dipole_spin/in.spin.iron_dipole_pppm b/examples/SPIN/dipole_spin/in.spin.iron_dipole_pppm
new file mode 100644
index 0000000000..28d7e4a4bc
--- /dev/null
+++ b/examples/SPIN/dipole_spin/in.spin.iron_dipole_pppm
@@ -0,0 +1,62 @@
+# bcc iron in a 3d periodic box
+
+clear 
+units 		metal
+atom_style 	spin
+
+dimension 	3
+boundary 	p p p
+
+# necessary for the serial algorithm (sametag)
+atom_modify 	map array 
+
+lattice 	bcc 2.8665
+region 		box block 0.0 5.0 0.0 5.0 0.0 5.0
+create_box 	1 box
+create_atoms 	1 box
+
+# setting mass, mag. moments, and interactions for bcc iron
+
+mass		1 55.845
+set 		group all spin 2.2 -1.0 0.0 0.0
+velocity 	all create 100 4928459 rot yes dist gaussian
+
+pair_style 	hybrid/overlay eam/alloy spin/exchange 3.5 spin/dipole/long 8.0
+pair_coeff 	* * eam/alloy Fe_Mishin2006.eam.alloy Fe
+pair_coeff 	* * spin/exchange exchange 3.4 0.02726 0.2171 1.841
+pair_coeff	* * spin/dipole/long 8.0
+
+neighbor 	0.1 bin
+neigh_modify 	every 10 check yes delay 20
+
+kspace_style 	pppm/dipole/spin 1.0e-4
+kspace_modify 	compute yes
+
+fix 		1 all precession/spin cubic 0.001 0.0005 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
+fix_modify 	1 energy yes
+fix 		2 all langevin/spin 0.0 0.0 21
+
+fix 		3 all nve/spin lattice yes
+timestep	0.0001
+
+# compute and output options
+
+compute 	out_mag    all spin
+compute 	out_pe     all pe
+compute 	out_ke     all ke
+compute 	out_temp   all temp
+
+variable 	magx      equal c_out_mag[1]
+variable 	magy      equal c_out_mag[2]
+variable 	magz      equal c_out_mag[3]
+variable 	magnorm   equal c_out_mag[4]
+variable 	emag      equal c_out_mag[5]
+variable 	tmag      equal c_out_mag[6]
+
+thermo_style    custom step time v_magx v_magy v_magz v_magnorm v_tmag v_emag pe etotal
+thermo          50
+
+compute 	outsp all property/atom spx spy spz sp fmx fmy fmz
+dump 		1 all custom 100 dump_iron.lammpstrj type x y z c_outsp[1] c_outsp[2] c_outsp[3]
+
+run 		2000
diff --git a/examples/SPIN/iron/in.spin.iron b/examples/SPIN/iron/in.spin.iron
index 1db5007dff..bb1b0e1b4d 100644
--- a/examples/SPIN/iron/in.spin.iron
+++ b/examples/SPIN/iron/in.spin.iron
@@ -19,7 +19,8 @@ create_atoms 	1 box
 
 mass		1 55.845
 
-set 		group all spin/random 31 2.2
+#set 		group all spin/random 31 2.2
+set 		group all spin 2.2 0.0 0.0 1.0
 velocity 	all create 100 4928459 rot yes dist gaussian
 
 pair_style 	hybrid/overlay eam/alloy spin/exchange 3.5
diff --git a/src/.gitignore b/src/.gitignore
index 82af337f3a..c79c958e6d 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -167,6 +167,10 @@
 /pair_spin.h
 /pair_spin_dmi.cpp
 /pair_spin_dmi.h
+/pair_spin_dipole_cut.cpp
+/pair_spin_dipole_cut.h
+/pair_spin_dipole_long.cpp
+/pair_spin_dipole_long.h
 /pair_spin_exchange.cpp
 /pair_spin_exchange.h
 /pair_spin_magelec.cpp
@@ -428,6 +432,10 @@
 /ewald.h
 /ewald_cg.cpp
 /ewald_cg.h
+/ewald_dipole.cpp
+/ewald_dipole.h
+/ewald_dipole_spin.cpp
+/ewald_dipole_spin.h
 /ewald_disp.cpp
 /ewald_disp.h
 /ewald_n.cpp
@@ -1027,6 +1035,10 @@
 /pppm.h
 /pppm_cg.cpp
 /pppm_cg.h
+/pppm_dipole.cpp
+/pppm_dipole.h
+/pppm_dipole_spin.cpp
+/pppm_dipole_spin.h
 /pppm_disp.cpp
 /pppm_disp.h
 /pppm_disp_tip4p.cpp
diff --git a/src/DIPOLE/pair_lj_cut_dipole_long.cpp b/src/DIPOLE/pair_lj_cut_dipole_long.cpp
index 0806d1ee73..42446a3777 100644
--- a/src/DIPOLE/pair_lj_cut_dipole_long.cpp
+++ b/src/DIPOLE/pair_lj_cut_dipole_long.cpp
@@ -44,7 +44,7 @@ using namespace MathConst;
 PairLJCutDipoleLong::PairLJCutDipoleLong(LAMMPS *lmp) : Pair(lmp)
 {
   single_enable = 0;
-  ewaldflag = dipoleflag = 1;
+  ewaldflag = pppmflag = dipoleflag = 1;
   respa_enable = 0;
 }
 
diff --git a/src/KSPACE/ewald_dipole.cpp b/src/KSPACE/ewald_dipole.cpp
new file mode 100644
index 0000000000..89ef7e39a8
--- /dev/null
+++ b/src/KSPACE/ewald_dipole.cpp
@@ -0,0 +1,920 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Julien Tranchida (SNL), Stan Moore (SNL)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include "ewald_dipole.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "pair.h"
+#include "domain.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "error.h"
+#include "update.h"
+
+#include "math_const.h"
+#include "math_special.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecial;
+
+#define SMALL 0.00001
+
+/* ---------------------------------------------------------------------- */
+
+EwaldDipole::EwaldDipole(LAMMPS *lmp) : Ewald(lmp),
+  tk(NULL), vc(NULL)
+{
+  ewaldflag = dipoleflag = 1;
+  group_group_enable = 0;
+  tk = NULL;
+  vc = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   free all memory
+------------------------------------------------------------------------- */
+
+EwaldDipole::~EwaldDipole()
+{
+  memory->destroy(tk);
+  memory->destroy(vc);
+}
+
+/* ----------------------------------------------------------------------
+   called once before run
+------------------------------------------------------------------------- */
+
+void EwaldDipole::init()
+{
+  if (comm->me == 0) {
+    if (screen) fprintf(screen,"EwaldDipole initialization ...\n");
+    if (logfile) fprintf(logfile,"EwaldDipole initialization ...\n");
+  }
+
+  // error check
+  
+  dipoleflag = atom->mu?1:0;
+  qsum_qsq(0); // q[i] might not be declared ?
+
+  if (dipoleflag && q2)
+    error->all(FLERR,"Cannot (yet) use charges with Kspace style EwaldDipole");
+
+  triclinic_check();
+  
+  // no triclinic ewald dipole (yet)
+  
+  triclinic = domain->triclinic;
+  if (triclinic)
+    error->all(FLERR,"Cannot (yet) use EwaldDipole with triclinic box");
+  
+  if (domain->dimension == 2)
+    error->all(FLERR,"Cannot use EwaldDipole with 2d simulation");
+
+  if (!atom->mu) error->all(FLERR,"Kspace style requires atom attribute mu");
+
+  if (dipoleflag && strcmp(update->unit_style,"electron") == 0)
+    error->all(FLERR,"Cannot (yet) use 'electron' units with dipoles");
+  
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all(FLERR,"Cannot use nonperiodic boundaries with EwaldDipole");
+  if (slabflag) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all(FLERR,"Incorrect boundaries with slab EwaldDipole");
+  }
+
+  // extract short-range Coulombic cutoff from pair style
+
+  triclinic = domain->triclinic;
+  if (triclinic)
+    error->all(FLERR,"Cannot yet use triclinic cells with EwaldDipole");
+
+  pair_check();
+
+  int itmp;
+  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
+  if (p_cutoff == NULL)
+    error->all(FLERR,"KSpace style is incompatible with Pair style");
+  double cutoff = *p_cutoff;
+
+  // kspace TIP4P not yet supported
+  // qdist = offset only for TIP4P fictitious charge
+
+  //qdist = 0.0;
+  if (tip4pflag)
+    error->all(FLERR,"Cannot yet use TIP4P with EwaldDipole");
+
+  // compute musum & musqsum and warn if no dipole
+
+  scale = 1.0;
+  qqrd2e = force->qqrd2e;
+  musum_musq();
+  natoms_original = atom->natoms;
+
+  // set accuracy (force units) from accuracy_relative or accuracy_absolute
+
+  if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
+  else accuracy = accuracy_relative * two_charge_force;
+
+  // setup K-space resolution
+
+  bigint natoms = atom->natoms;
+
+  // use xprd,yprd,zprd even if triclinic so grid size is the same
+  // adjust z dimension for 2d slab EwaldDipole
+  // 3d EwaldDipole just uses zprd since slab_volfactor = 1.0
+
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  double zprd_slab = zprd*slab_volfactor;
+
+  // make initial g_ewald estimate
+  // based on desired accuracy and real space cutoff
+  // fluid-occupied volume used to estimate real-space error
+  // zprd used rather than zprd_slab
+
+  if (!gewaldflag) {
+    if (accuracy <= 0.0)
+      error->all(FLERR,"KSpace accuracy must be > 0");
+    
+    // initial guess with old method
+    
+    g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*mu2);
+    if (g_ewald >= 1.0) g_ewald = (1.35 - 0.15*log(accuracy))/cutoff;
+    else g_ewald = sqrt(-log(g_ewald)) / cutoff;
+    
+    // try Newton solver
+
+    double g_ewald_new =
+      NewtonSolve(g_ewald,cutoff,natoms,xprd*yprd*zprd,mu2);
+    if (g_ewald_new > 0.0) g_ewald = g_ewald_new;
+    else error->warning(FLERR,"Ewald/disp Newton solver failed, "
+                        "using old method to estimate g_ewald");  
+  }
+
+  // setup EwaldDipole coefficients so can print stats
+
+  setup();
+
+  // final RMS accuracy
+
+  double lprx = rms(kxmax_orig,xprd,natoms,q2);
+  double lpry = rms(kymax_orig,yprd,natoms,q2);
+  double lprz = rms(kzmax_orig,zprd_slab,natoms,q2);
+  double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
+  double q2_over_sqrt = q2 / sqrt(natoms*cutoff*xprd*yprd*zprd_slab);
+  double spr = 2.0 *q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff);
+  double tpr = estimate_table_accuracy(q2_over_sqrt,spr);
+  double estimated_accuracy = sqrt(lpr*lpr + spr*spr + tpr*tpr);
+
+  // stats
+
+  if (comm->me == 0) {
+    if (screen) {
+      fprintf(screen,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(screen,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(screen,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(screen,"  KSpace vectors: actual max1d max3d = %d %d %d\n",
+              kcount,kmax,kmax3d);
+      fprintf(screen,"                  kxmax kymax kzmax  = %d %d %d\n",
+              kxmax,kymax,kzmax);
+    }
+    if (logfile) {
+      fprintf(logfile,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(logfile,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(logfile,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(logfile,"  KSpace vectors: actual max1d max3d = %d %d %d\n",
+              kcount,kmax,kmax3d);
+      fprintf(logfile,"                  kxmax kymax kzmax  = %d %d %d\n",
+              kxmax,kymax,kzmax);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   adjust EwaldDipole coeffs, called initially and whenever volume has changed
+------------------------------------------------------------------------- */
+
+void EwaldDipole::setup()
+{
+  // volume-dependent factors
+
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+
+  // adjustment of z dimension for 2d slab EwaldDipole
+  // 3d EwaldDipole just uses zprd since slab_volfactor = 1.0
+
+  double zprd_slab = zprd*slab_volfactor;
+  volume = xprd * yprd * zprd_slab;
+
+  unitk[0] = 2.0*MY_PI/xprd;
+  unitk[1] = 2.0*MY_PI/yprd;
+  unitk[2] = 2.0*MY_PI/zprd_slab;
+
+  int kmax_old = kmax;
+
+  if (kewaldflag == 0) {
+
+    // determine kmax
+    // function of current box size, accuracy, G_ewald (short-range cutoff)
+
+    bigint natoms = atom->natoms;
+    double err;
+    kxmax = 1;
+    kymax = 1;
+    kzmax = 1;	
+
+    // set kmax in 3 directions to respect accuracy
+
+    err = rms_dipole(kxmax,xprd,natoms);
+    while (err > accuracy) {
+      kxmax++;
+      err = rms_dipole(kxmax,xprd,natoms);
+    }
+
+    err = rms_dipole(kymax,yprd,natoms);
+    while (err > accuracy) {
+      kymax++;
+      err = rms_dipole(kymax,yprd,natoms);
+    }
+
+    err = rms_dipole(kzmax,zprd,natoms);
+    while (err > accuracy) {
+      kzmax++;
+      err = rms_dipole(kzmax,zprd,natoms);
+    }
+
+    kmax = MAX(kxmax,kymax);
+    kmax = MAX(kmax,kzmax);
+    kmax3d = 4*kmax*kmax*kmax + 6*kmax*kmax + 3*kmax;
+
+    double gsqxmx = unitk[0]*unitk[0]*kxmax*kxmax;
+    double gsqymx = unitk[1]*unitk[1]*kymax*kymax;
+    double gsqzmx = unitk[2]*unitk[2]*kzmax*kzmax;
+    gsqmx = MAX(gsqxmx,gsqymx);
+    gsqmx = MAX(gsqmx,gsqzmx);
+
+    kxmax_orig = kxmax;
+    kymax_orig = kymax;
+    kzmax_orig = kzmax;
+
+  } else {
+
+    kxmax = kx_ewald;
+    kymax = ky_ewald;
+    kzmax = kz_ewald;
+
+    kxmax_orig = kxmax;
+    kymax_orig = kymax;
+    kzmax_orig = kzmax;
+
+    kmax = MAX(kxmax,kymax);
+    kmax = MAX(kmax,kzmax);
+    kmax3d = 4*kmax*kmax*kmax + 6*kmax*kmax + 3*kmax;
+
+    double gsqxmx = unitk[0]*unitk[0]*kxmax*kxmax;
+    double gsqymx = unitk[1]*unitk[1]*kymax*kymax;
+    double gsqzmx = unitk[2]*unitk[2]*kzmax*kzmax;
+    gsqmx = MAX(gsqxmx,gsqymx);
+    gsqmx = MAX(gsqmx,gsqzmx);
+  }
+
+  gsqmx *= 1.00001;
+
+  // if size has grown, reallocate k-dependent and nlocal-dependent arrays
+
+  if (kmax > kmax_old) {
+    deallocate();
+    allocate();
+    group_allocate_flag = 0;
+
+    memory->destroy(ek);
+    memory->destroy(tk);
+    memory->destroy(vc);
+    memory->destroy3d_offset(cs,-kmax_created);
+    memory->destroy3d_offset(sn,-kmax_created);
+    nmax = atom->nmax;
+    memory->create(ek,nmax,3,"ewald_dipole:ek");
+    memory->create(tk,nmax,3,"ewald_dipole:tk");
+    memory->create(vc,kmax3d,6,"ewald_dipole:tk");
+    memory->create3d_offset(cs,-kmax,kmax,3,nmax,"ewald_dipole:cs");
+    memory->create3d_offset(sn,-kmax,kmax,3,nmax,"ewald_dipole:sn");
+    kmax_created = kmax;
+  }
+
+  // pre-compute EwaldDipole coefficients
+
+  coeffs();
+}
+
+/* ----------------------------------------------------------------------
+   compute dipole RMS accuracy for a dimension
+------------------------------------------------------------------------- */
+
+double EwaldDipole::rms_dipole(int km, double prd, bigint natoms)
+{
+  if (natoms == 0) natoms = 1;   // avoid division by zero
+
+  // error from eq.(46), Wang et al., JCP 115, 6351 (2001)
+  
+  double value = 8*MY_PI*mu2*g_ewald/volume *
+    sqrt(2*MY_PI*km*km*km/(15.0*natoms)) *
+    exp(-MY_PI*MY_PI*km*km/(g_ewald*g_ewald*prd*prd));
+
+  return value;
+}
+
+/* ----------------------------------------------------------------------
+   compute the EwaldDipole long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+void EwaldDipole::compute(int eflag, int vflag)
+{
+  int i,j,k;
+  const double g3 = g_ewald*g_ewald*g_ewald;
+
+  // set energy/virial flags
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  // if atom count has changed, update qsum and qsqsum
+
+  if (atom->natoms != natoms_original) {
+    musum_musq();
+    natoms_original = atom->natoms;
+  }
+
+  // return if there are no charges
+
+  if (musqsum == 0.0) return;
+
+  // extend size of per-atom arrays if necessary
+
+  if (atom->nmax > nmax) {
+    memory->destroy(ek);
+    memory->destroy(tk);
+    memory->destroy(vc);
+    memory->destroy3d_offset(cs,-kmax_created);
+    memory->destroy3d_offset(sn,-kmax_created);
+    nmax = atom->nmax;
+    memory->create(ek,nmax,3,"ewald_dipole:ek");
+    memory->create(tk,nmax,3,"ewald_dipole:tk");
+    memory->create(vc,kmax3d,6,"ewald_dipole:tk");
+    memory->create3d_offset(cs,-kmax,kmax,3,nmax,"ewald_dipole:cs");
+    memory->create3d_offset(sn,-kmax,kmax,3,nmax,"ewald_dipole:sn");
+    kmax_created = kmax;
+  }
+
+  // partial structure factors on each processor
+  // total structure factor by summing over procs
+
+  eik_dot_r();
+
+  MPI_Allreduce(sfacrl,sfacrl_all,kcount,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(sfacim,sfacim_all,kcount,MPI_DOUBLE,MPI_SUM,world);
+
+  // K-space portion of electric field
+  // double loop over K-vectors and local atoms
+  // perform per-atom calculations if needed
+
+  double **f = atom->f;
+  double **t = atom->torque;
+  double **mu = atom->mu;
+  int nlocal = atom->nlocal;
+
+  int kx,ky,kz;
+  double cypz,sypz,exprl,expim;
+  double partial,partial2,partial_peratom;
+  double vcik[6];
+  double mudotk;
+
+  for (i = 0; i < nlocal; i++) {
+    ek[i][0] = ek[i][1] = ek[i][2] = 0.0;
+    tk[i][0] = tk[i][1] = tk[i][2] = 0.0;
+  }
+
+  for (k = 0; k < kcount; k++) {
+    kx = kxvecs[k];
+    ky = kyvecs[k];
+    kz = kzvecs[k];
+    for (j = 0; j<6; j++) vc[k][j] = 0.0;
+
+    for (i = 0; i < nlocal; i++) {
+
+      for (j = 0; j<6; j++) vcik[j] = 0.0;
+      
+      // re-evaluating mu dot k
+      mudotk = mu[i][0]*kx*unitk[0] + mu[i][1]*ky*unitk[1] + mu[i][2]*kz*unitk[2];
+
+      // calculating  re and im of exp(i*k*ri)
+
+      cypz = cs[ky][1][i]*cs[kz][2][i] - sn[ky][1][i]*sn[kz][2][i];
+      sypz = sn[ky][1][i]*cs[kz][2][i] + cs[ky][1][i]*sn[kz][2][i];
+      exprl = cs[kx][0][i]*cypz - sn[kx][0][i]*sypz;
+      expim = sn[kx][0][i]*cypz + cs[kx][0][i]*sypz;
+
+      // taking im of struct_fact x exp(i*k*ri) (for force calc.)
+
+      partial = (mudotk)*(expim*sfacrl_all[k] - exprl*sfacim_all[k]);
+      ek[i][0] += partial * eg[k][0];
+      ek[i][1] += partial * eg[k][1];
+      ek[i][2] += partial * eg[k][2];
+
+      // compute field for torque calculation
+
+      partial_peratom = exprl*sfacrl_all[k] + expim*sfacim_all[k];
+      tk[i][0] += partial_peratom * eg[k][0];
+      tk[i][1] += partial_peratom * eg[k][1];
+      tk[i][2] += partial_peratom * eg[k][2];
+
+      // total and per-atom virial correction (dipole only)
+
+      vc[k][0] += vcik[0] = -(partial_peratom * mu[i][0] * eg[k][0]);
+      vc[k][1] += vcik[1] = -(partial_peratom * mu[i][1] * eg[k][1]);
+      vc[k][2] += vcik[2] = -(partial_peratom * mu[i][2] * eg[k][2]);
+      vc[k][3] += vcik[3] = -(partial_peratom * mu[i][0] * eg[k][1]);
+      vc[k][4] += vcik[4] = -(partial_peratom * mu[i][0] * eg[k][2]);
+      vc[k][5] += vcik[5] = -(partial_peratom * mu[i][1] * eg[k][2]);
+      
+      // taking re-part of struct_fact x exp(i*k*ri) 
+      // (for per-atom energy and virial calc.)
+
+      if (evflag_atom) {
+        if (eflag_atom) eatom[i] += mudotk*ug[k]*partial_peratom;
+        if (vflag_atom)
+          for (j = 0; j < 6; j++)
+	    vatom[i][j] += (ug[k]*mudotk*vg[k][j]*partial_peratom - vcik[j]);
+      }
+    }
+  }
+
+  // force and torque calculation
+
+  const double muscale = qqrd2e * scale;
+
+  for (i = 0; i < nlocal; i++) {
+    f[i][0] += muscale * ek[i][0];
+    f[i][1] += muscale * ek[i][1];
+    if (slabflag != 2) f[i][2] += muscale * ek[i][2];
+    t[i][0] -= muscale * (mu[i][1]*tk[i][2] - mu[i][2]*tk[i][1]);
+    t[i][1] -= muscale * (mu[i][2]*tk[i][0] - mu[i][0]*tk[i][2]);
+    if (slabflag != 2) t[i][2] -= muscale * (mu[i][0]*tk[i][1] - mu[i][1]*tk[i][0]);
+  }
+
+  // sum global energy across Kspace vevs and add in volume-dependent term
+  // taking the re-part of struct_fact_i x struct_fact_j
+  // substracting self energy and scaling
+
+  if (eflag_global) {
+    for (k = 0; k < kcount; k++) {
+      energy += ug[k] * (sfacrl_all[k]*sfacrl_all[k] +
+                         sfacim_all[k]*sfacim_all[k]);
+    }
+    energy -= musqsum*2.0*g3/3.0/MY_PIS;
+    energy *= muscale;
+  }
+
+  // global virial
+
+  if (vflag_global) {
+    double uk, vk;
+    for (k = 0; k < kcount; k++) {
+      uk = ug[k] * (sfacrl_all[k]*sfacrl_all[k] + sfacim_all[k]*sfacim_all[k]);
+      for (j = 0; j < 6; j++) virial[j] += uk*vg[k][j] - vc[k][j];
+    }
+    for (j = 0; j < 6; j++) virial[j] *= muscale;
+  }
+
+  // per-atom energy/virial
+  // energy includes self-energy correction
+
+  if (evflag_atom) {
+    if (eflag_atom) {
+      for (i = 0; i < nlocal; i++) {
+        eatom[i] -= (mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + mu[i][2]*mu[i][2])
+	  *2.0*g3/3.0/MY_PIS;
+        eatom[i] *= muscale;
+      }
+    }
+
+    if (vflag_atom)
+      for (i = 0; i < nlocal; i++)
+        for (j = 0; j < 6; j++) vatom[i][j] *= muscale;
+  }
+
+  // 2d slab correction
+
+  if (slabflag == 1) slabcorr();
+}
+
+/* ----------------------------------------------------------------------
+   compute the struc. factors and mu dot k products
+------------------------------------------------------------------------- */
+
+void EwaldDipole::eik_dot_r()
+{
+  int i,k,l,m,n,ic;
+  double cstr1,sstr1,cstr2,sstr2,cstr3,sstr3,cstr4,sstr4;
+  double sqk,clpm,slpm;
+  double mux, muy, muz;
+  double mudotk;
+
+  double **x = atom->x;
+  double **mu = atom->mu;
+  int nlocal = atom->nlocal;
+
+  n = 0;
+  mux = muy = muz = 0.0;
+
+  // loop on different k-directions
+  // loop on n kpoints and nlocal atoms
+
+  // (k,0,0), (0,l,0), (0,0,m)
+  
+  // loop 1: k=1, l=1, m=1
+  // define first val. of cos and sin
+
+  for (ic = 0; ic < 3; ic++) {
+    sqk = unitk[ic]*unitk[ic];
+    if (sqk <= gsqmx) {
+      cstr1 = 0.0;
+      sstr1 = 0.0;
+      for (i = 0; i < nlocal; i++) {
+        cs[0][ic][i] = 1.0;
+        sn[0][ic][i] = 0.0;
+        cs[1][ic][i] = cos(unitk[ic]*x[i][ic]);
+        sn[1][ic][i] = sin(unitk[ic]*x[i][ic]);
+        cs[-1][ic][i] = cs[1][ic][i];
+        sn[-1][ic][i] = -sn[1][ic][i];
+        mudotk = (mu[i][ic]*unitk[ic]);
+        cstr1 += mudotk*cs[1][ic][i];
+        sstr1 += mudotk*sn[1][ic][i];
+      }
+      sfacrl[n] = cstr1;
+      sfacim[n++] = sstr1;
+    }
+  }
+
+  // loop 2: k>1, l>1, m>1
+
+  for (m = 2; m <= kmax; m++) {
+    for (ic = 0; ic < 3; ic++) {
+      sqk = m*unitk[ic] * m*unitk[ic];
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+          cs[m][ic][i] = cs[m-1][ic][i]*cs[1][ic][i] -
+            sn[m-1][ic][i]*sn[1][ic][i];
+          sn[m][ic][i] = sn[m-1][ic][i]*cs[1][ic][i] +
+            cs[m-1][ic][i]*sn[1][ic][i];
+          cs[-m][ic][i] = cs[m][ic][i];
+          sn[-m][ic][i] = -sn[m][ic][i];
+	  mudotk = (mu[i][ic]*m*unitk[ic]);
+          cstr1 += mudotk*cs[m][ic][i];
+          sstr1 += mudotk*sn[m][ic][i];
+        }
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+      }
+    }
+  }
+
+  // 1 = (k,l,0), 2 = (k,-l,0)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (l = 1; l <= kymax; l++) {
+      sqk = (k*unitk[0] * k*unitk[0]) + (l*unitk[1] * l*unitk[1]);
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        cstr2 = 0.0;
+        sstr2 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+	  mux = mu[i][0];
+	  muy = mu[i][1];
+
+	  // dir 1: (k,l,0)
+	  mudotk = (mux*k*unitk[0] + muy*l*unitk[1]);
+          cstr1 += mudotk*(cs[k][0][i]*cs[l][1][i]-sn[k][0][i]*sn[l][1][i]);
+          sstr1 += mudotk*(sn[k][0][i]*cs[l][1][i]+cs[k][0][i]*sn[l][1][i]);
+	  
+	  // dir 2: (k,-l,0)
+	  mudotk = (mux*k*unitk[0] - muy*l*unitk[1]);
+          cstr2 += mudotk*(cs[k][0][i]*cs[l][1][i]+sn[k][0][i]*sn[l][1][i]);
+          sstr2 += mudotk*(sn[k][0][i]*cs[l][1][i]-cs[k][0][i]*sn[l][1][i]);
+	}
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+        sfacrl[n] = cstr2;
+        sfacim[n++] = sstr2;
+      }
+    }
+  }
+
+  // 1 = (0,l,m), 2 = (0,l,-m)
+
+  for (l = 1; l <= kymax; l++) {
+    for (m = 1; m <= kzmax; m++) {
+      sqk = (l*unitk[1] * l*unitk[1]) + (m*unitk[2] * m*unitk[2]);
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        cstr2 = 0.0;
+        sstr2 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+	  muy = mu[i][1];
+	  muz = mu[i][2];
+
+	  // dir 1: (0,l,m)
+      	  mudotk = (muy*l*unitk[1] + muz*m*unitk[2]); 
+          cstr1 += mudotk*(cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i]);
+          sstr1 += mudotk*(sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i]);
+	  
+	  // dir 2: (0,l,-m)
+	  mudotk = (muy*l*unitk[1] - muz*m*unitk[2]); 
+          cstr2 += mudotk*(cs[l][1][i]*cs[m][2][i]+sn[l][1][i]*sn[m][2][i]);
+          sstr2 += mudotk*(sn[l][1][i]*cs[m][2][i]-cs[l][1][i]*sn[m][2][i]);
+        }
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+        sfacrl[n] = cstr2;
+        sfacim[n++] = sstr2;
+      }
+    }
+  }
+
+  // 1 = (k,0,m), 2 = (k,0,-m)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (m = 1; m <= kzmax; m++) {
+      sqk = (k*unitk[0] * k*unitk[0]) + (m*unitk[2] * m*unitk[2]);
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        cstr2 = 0.0;
+        sstr2 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+      	  mux = mu[i][0];
+	  muz = mu[i][2];
+
+	  // dir 1: (k,0,m)
+	  mudotk = (mux*k*unitk[0] + muz*m*unitk[2]); 
+          cstr1 += mudotk*(cs[k][0][i]*cs[m][2][i]-sn[k][0][i]*sn[m][2][i]);
+          sstr1 += mudotk*(sn[k][0][i]*cs[m][2][i]+cs[k][0][i]*sn[m][2][i]);
+	  
+	  // dir 2: (k,0,-m)
+	  mudotk = (mux*k*unitk[0] - muz*m*unitk[2]); 
+          cstr2 += mudotk*(cs[k][0][i]*cs[m][2][i]+sn[k][0][i]*sn[m][2][i]);
+          sstr2 += mudotk*(sn[k][0][i]*cs[m][2][i]-cs[k][0][i]*sn[m][2][i]);
+        }
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+        sfacrl[n] = cstr2;
+        sfacim[n++] = sstr2;
+      }
+    }
+  }
+
+  // 1 = (k,l,m), 2 = (k,-l,m), 3 = (k,l,-m), 4 = (k,-l,-m)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (l = 1; l <= kymax; l++) {
+      for (m = 1; m <= kzmax; m++) {
+        sqk = (k*unitk[0] * k*unitk[0]) + (l*unitk[1] * l*unitk[1]) +
+          (m*unitk[2] * m*unitk[2]);
+        if (sqk <= gsqmx) {
+          cstr1 = 0.0;
+          sstr1 = 0.0;
+          cstr2 = 0.0;
+          sstr2 = 0.0;
+          cstr3 = 0.0;
+          sstr3 = 0.0;
+          cstr4 = 0.0;
+          sstr4 = 0.0;
+          for (i = 0; i < nlocal; i++) {
+      	    mux = mu[i][0];
+	    muy = mu[i][1];
+	    muz = mu[i][2];
+
+	    // dir 1: (k,l,m)
+	    mudotk = (mux*k*unitk[0] + muy*l*unitk[1] + muz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
+            slpm = sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
+            cstr1 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr1 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+	    // dir 2: (k,-l,m)
+	    mudotk = (mux*k*unitk[0] - muy*l*unitk[1] + muz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i];
+            slpm = -sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
+            cstr2 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr2 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+	    // dir 3: (k,l,-m)
+	    mudotk = (mux*k*unitk[0] + muy*l*unitk[1] - muz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i];
+            slpm = sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i];
+            cstr3 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr3 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+	    // dir 4: (k,-l,-m)
+	    mudotk = (mux*k*unitk[0] - muy*l*unitk[1] - muz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
+            slpm = -sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i];
+            cstr4 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr4 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+          }
+          sfacrl[n] = cstr1;
+          sfacim[n++] = sstr1;
+          sfacrl[n] = cstr2;
+          sfacim[n++] = sstr2;
+          sfacrl[n] = cstr3;
+          sfacim[n++] = sstr3;
+          sfacrl[n] = cstr4;
+          sfacim[n++] = sstr4;
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Slab-geometry correction term to dampen inter-slab interactions between
+   periodically repeating slabs.  Yields good approximation to 2D EwaldDipole if
+   adequate empty space is left between repeating slabs (J. Chem. Phys.
+   111, 3155).  Slabs defined here to be parallel to the xy plane. Also
+   extended to non-neutral systems (J. Chem. Phys. 131, 094107).
+------------------------------------------------------------------------- */
+
+void EwaldDipole::slabcorr()
+{
+  // compute local contribution to global dipole moment
+
+  double **x = atom->x;
+  double zprd = domain->zprd;
+  int nlocal = atom->nlocal;
+
+  double dipole = 0.0;
+  double **mu = atom->mu;
+  for (int i = 0; i < nlocal; i++) dipole += mu[i][2];
+
+  // sum local contributions to get global dipole moment
+
+  double dipole_all;
+  MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world);
+
+  // need to make non-neutral systems and/or
+  //  per-atom energy translationally invariant
+
+  if (eflag_atom || fabs(qsum) > SMALL) {
+
+    error->all(FLERR,"Cannot (yet) use kspace slab correction with "
+      "long-range dipoles and non-neutral systems or per-atom energy");
+  }
+
+  // compute corrections
+
+  const double e_slabcorr = MY_2PI*(dipole_all*dipole_all/12.0)/volume;
+  const double qscale = qqrd2e * scale;
+
+  if (eflag_global) energy += qscale * e_slabcorr;
+
+  // per-atom energy
+
+  if (eflag_atom) {
+    double efact = qscale * MY_2PI/volume/12.0;
+    for (int i = 0; i < nlocal; i++)
+      eatom[i] += efact * mu[i][2]*dipole_all;
+  }
+
+  // add on torque corrections
+
+  if (atom->torque) {
+    double ffact = qscale * (-4.0*MY_PI/volume);
+    double **mu = atom->mu;
+    double **torque = atom->torque;
+    for (int i = 0; i < nlocal; i++) {
+      torque[i][0] += ffact * dipole_all * mu[i][1];
+      torque[i][1] += -ffact * dipole_all * mu[i][0];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute musum,musqsum,mu2
+   called initially, when particle count changes, when dipoles are changed
+------------------------------------------------------------------------- */
+
+void EwaldDipole::musum_musq()
+{
+  const int nlocal = atom->nlocal;
+
+  musum = musqsum = mu2 = 0.0;
+  if (atom->mu_flag) {
+    double** mu = atom->mu;
+    double musum_local(0.0), musqsum_local(0.0);
+
+    for (int i = 0; i < nlocal; i++) {
+      musum_local += mu[i][0] + mu[i][1] + mu[i][2];
+      musqsum_local += mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + mu[i][2]*mu[i][2];
+    }
+
+    MPI_Allreduce(&musum_local,&musum,1,MPI_DOUBLE,MPI_SUM,world);
+    MPI_Allreduce(&musqsum_local,&musqsum,1,MPI_DOUBLE,MPI_SUM,world);
+
+    mu2 = musqsum * force->qqrd2e;
+  }
+
+  if (mu2 == 0 && comm->me == 0)
+    error->all(FLERR,"Using kspace solver EwaldDipole on system with no dipoles");
+}
+
+/* ----------------------------------------------------------------------
+   Newton solver used to find g_ewald for LJ systems
+------------------------------------------------------------------------- */
+
+double EwaldDipole::NewtonSolve(double x, double Rc,
+                              bigint natoms, double vol, double b2)
+{
+  double dx,tol;
+  int maxit;
+
+  maxit = 10000; //Maximum number of iterations
+  tol = 0.00001; //Convergence tolerance
+
+  //Begin algorithm
+
+  for (int i = 0; i < maxit; i++) {
+    dx = f(x,Rc,natoms,vol,b2) / derivf(x,Rc,natoms,vol,b2);
+    x = x - dx; //Update x
+    if (fabs(dx) < tol) return x;
+    if (x < 0 || x != x) // solver failed
+      return -1;
+  }
+  return -1;
+}
+
+/* ----------------------------------------------------------------------
+ Calculate f(x)
+ ------------------------------------------------------------------------- */
+
+double EwaldDipole::f(double x, double Rc, bigint natoms, double vol, double b2)
+{
+  double a = Rc*x;
+  double f = 0.0;
+
+  double rg2 = a*a;
+  double rg4 = rg2*rg2;
+  double rg6 = rg4*rg2;
+  double Cc = 4.0*rg4 + 6.0*rg2 + 3.0;
+  double Dc = 8.0*rg6 + 20.0*rg4 + 30.0*rg2 + 15.0;
+  f = (b2/(sqrt(vol*powint(x,4)*powint(Rc,9)*natoms)) *
+    sqrt(13.0/6.0*Cc*Cc + 2.0/15.0*Dc*Dc - 13.0/15.0*Cc*Dc) *
+    exp(-rg2)) - accuracy;
+
+  return f;
+}
+
+/* ----------------------------------------------------------------------
+ Calculate numerical derivative f'(x)
+ ------------------------------------------------------------------------- */
+
+double EwaldDipole::derivf(double x, double Rc,
+                         bigint natoms, double vol, double b2)
+{
+  double h = 0.000001;  //Derivative step-size
+  return (f(x + h,Rc,natoms,vol,b2) - f(x,Rc,natoms,vol,b2)) / h;
+}
+
diff --git a/src/KSPACE/ewald_dipole.h b/src/KSPACE/ewald_dipole.h
new file mode 100644
index 0000000000..741756487b
--- /dev/null
+++ b/src/KSPACE/ewald_dipole.h
@@ -0,0 +1,104 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(ewald/dipole,EwaldDipole)
+
+#else
+
+#ifndef LMP_EWALD_DIPOLE_H
+#define LMP_EWALD_DIPOLE_H
+
+#include "ewald.h"
+
+namespace LAMMPS_NS {
+
+class EwaldDipole : public Ewald {
+ public:
+  EwaldDipole(class LAMMPS *);
+  virtual ~EwaldDipole();
+  void init();
+  void setup();
+  virtual void compute(int, int);
+
+ protected:
+  double musum,musqsum,mu2;
+  double **tk;			// field for torque 
+  double **vc;			// virial per k
+
+  void musum_musq(); 
+  double rms_dipole(int, double, bigint);
+  virtual void eik_dot_r();
+  void slabcorr();
+  double NewtonSolve(double, double, bigint, double, double);
+  double f(double, double, bigint, double, double);
+  double derivf(double, double, bigint, double, double);
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Cannot use EwaldDipole with 2d simulation
+
+The kspace style ewald cannot be used in 2d simulations.  You can use
+2d EwaldDipole in a 3d simulation; see the kspace_modify command.
+
+E: Kspace style requires atom attribute q
+
+The atom style defined does not have these attributes.
+
+E: Cannot use nonperiodic boundaries with EwaldDipole
+
+For kspace style ewald, all 3 dimensions must have periodic boundaries
+unless you use the kspace_modify command to define a 2d slab with a
+non-periodic z dimension.
+
+E: Incorrect boundaries with slab EwaldDipole
+
+Must have periodic x,y dimensions and non-periodic z dimension to use
+2d slab option with EwaldDipole.
+
+E: Cannot (yet) use EwaldDipole with triclinic box and slab correction
+
+This feature is not yet supported.
+
+E: KSpace style is incompatible with Pair style
+
+Setting a kspace style requires that a pair style with matching
+long-range Coulombic or dispersion components be used.
+
+E: KSpace accuracy must be > 0
+
+The kspace accuracy designated in the input must be greater than zero.
+
+E: Must use 'kspace_modify gewald' for uncharged system
+
+UNDOCUMENTED
+
+E: Cannot (yet) use K-space slab correction with compute group/group for triclinic systems
+
+This option is not yet supported.
+
+*/
diff --git a/src/KSPACE/ewald_dipole_spin.cpp b/src/KSPACE/ewald_dipole_spin.cpp
new file mode 100644
index 0000000000..698203c85c
--- /dev/null
+++ b/src/KSPACE/ewald_dipole_spin.cpp
@@ -0,0 +1,857 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Julien Tranchida (SNL)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include <cmath>
+#include "ewald_dipole_spin.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "pair.h"
+#include "domain.h"
+#include "math_const.h"
+#include "math_special.h"
+#include "memory.h"
+#include "error.h"
+#include "update.h"
+
+#include "math_const.h"
+#include "math_special.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecial;
+
+#define SMALL 0.00001
+
+/* ---------------------------------------------------------------------- */
+
+EwaldDipoleSpin::EwaldDipoleSpin(LAMMPS *lmp) : 
+  EwaldDipole(lmp)
+{
+  dipoleflag = 0;
+  spinflag = 1;
+  
+  hbar = force->hplanck/MY_2PI;         	// eV/(rad.THz)
+  mub = 9.274e-4;                     		// in A.Ang^2
+  mu_0 = 785.15;               			// in eV/Ang/A^2
+  mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);	// in eV.Ang^3
+  mub2mu0hbinv = mub2mu0 / hbar;        	// in rad.THz
+}
+
+/* ----------------------------------------------------------------------
+   free all memory
+------------------------------------------------------------------------- */
+
+EwaldDipoleSpin::~EwaldDipoleSpin() {}
+
+/* ----------------------------------------------------------------------
+   called once before run
+------------------------------------------------------------------------- */
+
+void EwaldDipoleSpin::init()
+{
+  if (comm->me == 0) {
+    if (screen) fprintf(screen,"EwaldDipoleSpin initialization ...\n");
+    if (logfile) fprintf(logfile,"EwaldDipoleSpin initialization ...\n");
+  }
+
+  // error check
+  
+  spinflag = atom->sp?1:0;
+
+  triclinic_check();
+  
+  // no triclinic ewald spin (yet)
+  
+  triclinic = domain->triclinic;
+  if (triclinic)
+    error->all(FLERR,"Cannot (yet) use EwaldDipoleSpin with triclinic box");
+  
+  if (domain->dimension == 2)
+    error->all(FLERR,"Cannot use EwaldDipoleSpin with 2d simulation");
+
+  if (!atom->sp) error->all(FLERR,"Kspace style requires atom attribute sp");
+
+  if ((spinflag && strcmp(update->unit_style,"metal")) != 0)
+    error->all(FLERR,"'metal' units have to be used with spins");
+  
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all(FLERR,"Cannot use nonperiodic boundaries with EwaldDipoleSpin");
+  if (slabflag) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all(FLERR,"Incorrect boundaries with slab EwaldDipoleSpin");
+  }
+
+  // extract short-range Coulombic cutoff from pair style
+
+  pair_check();
+
+  int itmp;
+  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
+  if (p_cutoff == NULL)
+    error->all(FLERR,"KSpace style is incompatible with Pair style");
+  double cutoff = *p_cutoff;
+
+  // kspace TIP4P not yet supported
+  // qdist = offset only for TIP4P fictitious charge
+
+  //qdist = 0.0;
+  if (tip4pflag)
+    error->all(FLERR,"Cannot yet use TIP4P with EwaldDipoleSpin");
+
+  // compute musum & musqsum and warn if no spin
+
+  scale = 1.0;
+  qqrd2e = force->qqrd2e;
+  spsum_musq();
+  natoms_original = atom->natoms;
+
+  // set accuracy (force units) from accuracy_relative or accuracy_absolute
+
+  if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
+  else accuracy = accuracy_relative * two_charge_force;
+
+  // setup K-space resolution
+
+  bigint natoms = atom->natoms;
+
+  // use xprd,yprd,zprd even if triclinic so grid size is the same
+  // adjust z dimension for 2d slab EwaldDipoleSpin
+  // 3d EwaldDipoleSpin just uses zprd since slab_volfactor = 1.0
+
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  double zprd_slab = zprd*slab_volfactor;
+
+  // make initial g_ewald estimate
+  // based on desired accuracy and real space cutoff
+  // fluid-occupied volume used to estimate real-space error
+  // zprd used rather than zprd_slab
+
+  if (!gewaldflag) {
+    if (accuracy <= 0.0)
+      error->all(FLERR,"KSpace accuracy must be > 0");
+    
+    // initial guess with old method
+    
+    g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*mu2);
+    if (g_ewald >= 1.0) g_ewald = (1.35 - 0.15*log(accuracy))/cutoff;
+    else g_ewald = sqrt(-log(g_ewald)) / cutoff;
+    
+    // try Newton solver
+
+    double g_ewald_new =
+      NewtonSolve(g_ewald,cutoff,natoms,xprd*yprd*zprd,mu2);
+    if (g_ewald_new > 0.0) g_ewald = g_ewald_new;
+    else error->warning(FLERR,"Ewald/disp Newton solver failed, "
+                        "using old method to estimate g_ewald");  
+  }
+
+  // setup EwaldDipoleSpin coefficients so can print stats
+
+  setup();
+
+  // final RMS accuracy
+
+  double lprx = rms(kxmax_orig,xprd,natoms,q2);
+  double lpry = rms(kymax_orig,yprd,natoms,q2);
+  double lprz = rms(kzmax_orig,zprd_slab,natoms,q2);
+  double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
+  double q2_over_sqrt = q2 / sqrt(natoms*cutoff*xprd*yprd*zprd_slab);
+  double spr = 2.0 *q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff);
+  double tpr = estimate_table_accuracy(q2_over_sqrt,spr);
+  double estimated_accuracy = sqrt(lpr*lpr + spr*spr + tpr*tpr);
+
+  // stats
+
+  if (comm->me == 0) {
+    if (screen) {
+      fprintf(screen,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(screen,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(screen,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(screen,"  KSpace vectors: actual max1d max3d = %d %d %d\n",
+              kcount,kmax,kmax3d);
+      fprintf(screen,"                  kxmax kymax kzmax  = %d %d %d\n",
+              kxmax,kymax,kzmax);
+    }
+    if (logfile) {
+      fprintf(logfile,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(logfile,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(logfile,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(logfile,"  KSpace vectors: actual max1d max3d = %d %d %d\n",
+              kcount,kmax,kmax3d);
+      fprintf(logfile,"                  kxmax kymax kzmax  = %d %d %d\n",
+              kxmax,kymax,kzmax);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   adjust EwaldDipoleSpin coeffs, called initially and whenever volume has changed
+------------------------------------------------------------------------- */
+
+void EwaldDipoleSpin::setup()
+{
+  // volume-dependent factors
+
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+
+  // adjustment of z dimension for 2d slab EwaldDipoleSpin
+  // 3d EwaldDipoleSpin just uses zprd since slab_volfactor = 1.0
+
+  double zprd_slab = zprd*slab_volfactor;
+  volume = xprd * yprd * zprd_slab;
+
+  unitk[0] = 2.0*MY_PI/xprd;
+  unitk[1] = 2.0*MY_PI/yprd;
+  unitk[2] = 2.0*MY_PI/zprd_slab;
+
+  int kmax_old = kmax;
+
+  if (kewaldflag == 0) {
+
+    // determine kmax
+    // function of current box size, accuracy, G_ewald (short-range cutoff)
+
+    bigint natoms = atom->natoms;
+    double err;
+    kxmax = 1;
+    kymax = 1;
+    kzmax = 1;	
+
+    // set kmax in 3 directions to respect accuracy
+
+    err = rms_dipole(kxmax,xprd,natoms);
+    while (err > accuracy) {
+      kxmax++;
+      err = rms_dipole(kxmax,xprd,natoms);
+    }
+
+    err = rms_dipole(kymax,yprd,natoms);
+    while (err > accuracy) {
+      kymax++;
+      err = rms_dipole(kymax,yprd,natoms);
+    }
+
+    err = rms_dipole(kzmax,zprd,natoms);
+    while (err > accuracy) {
+      kzmax++;
+      err = rms_dipole(kzmax,zprd,natoms);
+    }
+
+    kmax = MAX(kxmax,kymax);
+    kmax = MAX(kmax,kzmax);
+    kmax3d = 4*kmax*kmax*kmax + 6*kmax*kmax + 3*kmax;
+
+    double gsqxmx = unitk[0]*unitk[0]*kxmax*kxmax;
+    double gsqymx = unitk[1]*unitk[1]*kymax*kymax;
+    double gsqzmx = unitk[2]*unitk[2]*kzmax*kzmax;
+    gsqmx = MAX(gsqxmx,gsqymx);
+    gsqmx = MAX(gsqmx,gsqzmx);
+
+    kxmax_orig = kxmax;
+    kymax_orig = kymax;
+    kzmax_orig = kzmax;
+
+  } else {
+
+    kxmax = kx_ewald;
+    kymax = ky_ewald;
+    kzmax = kz_ewald;
+
+    kxmax_orig = kxmax;
+    kymax_orig = kymax;
+    kzmax_orig = kzmax;
+
+    kmax = MAX(kxmax,kymax);
+    kmax = MAX(kmax,kzmax);
+    kmax3d = 4*kmax*kmax*kmax + 6*kmax*kmax + 3*kmax;
+
+    double gsqxmx = unitk[0]*unitk[0]*kxmax*kxmax;
+    double gsqymx = unitk[1]*unitk[1]*kymax*kymax;
+    double gsqzmx = unitk[2]*unitk[2]*kzmax*kzmax;
+    gsqmx = MAX(gsqxmx,gsqymx);
+    gsqmx = MAX(gsqmx,gsqzmx);
+  }
+
+  gsqmx *= 1.00001;
+
+  // if size has grown, reallocate k-dependent and nlocal-dependent arrays
+
+  if (kmax > kmax_old) {
+    deallocate();
+    allocate();
+    group_allocate_flag = 0;
+
+    memory->destroy(ek);
+    memory->destroy(tk);
+    memory->destroy(vc);
+    memory->destroy3d_offset(cs,-kmax_created);
+    memory->destroy3d_offset(sn,-kmax_created);
+    nmax = atom->nmax;
+    memory->create(ek,nmax,3,"ewald_dipole_spin:ek");
+    memory->create(tk,nmax,3,"ewald_dipole_spin:tk");
+    memory->create(vc,kmax3d,6,"ewald_dipole_spin:tk");
+    memory->create3d_offset(cs,-kmax,kmax,3,nmax,"ewald_dipole_spin:cs");
+    memory->create3d_offset(sn,-kmax,kmax,3,nmax,"ewald_dipole_spin:sn");
+    kmax_created = kmax;
+  }
+
+  // pre-compute EwaldDipoleSpin coefficients
+
+  coeffs();
+}
+
+/* ----------------------------------------------------------------------
+   compute the EwaldDipoleSpin long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+void EwaldDipoleSpin::compute(int eflag, int vflag)
+{
+  int i,j,k;
+  const double g3 = g_ewald*g_ewald*g_ewald;
+  double spx, spy, spz;
+
+  // set energy/virial flags
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  // if atom count has changed, update qsum and qsqsum
+
+  if (atom->natoms != natoms_original) {
+    spsum_musq();
+    natoms_original = atom->natoms;
+  }
+
+  // return if there are no charges
+
+  if (musqsum == 0.0) return;
+
+  // extend size of per-atom arrays if necessary
+
+  if (atom->nmax > nmax) {
+    memory->destroy(ek);
+    memory->destroy(tk);
+    memory->destroy(vc);
+    memory->destroy3d_offset(cs,-kmax_created);
+    memory->destroy3d_offset(sn,-kmax_created);
+    nmax = atom->nmax;
+    memory->create(ek,nmax,3,"ewald_dipole_spin:ek");
+    memory->create(tk,nmax,3,"ewald_dipole_spin:tk");
+    memory->create(vc,kmax3d,6,"ewald_dipole_spin:tk");
+    memory->create3d_offset(cs,-kmax,kmax,3,nmax,"ewald_dipole_spin:cs");
+    memory->create3d_offset(sn,-kmax,kmax,3,nmax,"ewald_dipole_spin:sn");
+    kmax_created = kmax;
+  }
+
+  // partial structure factors on each processor
+  // total structure factor by summing over procs
+
+  eik_dot_r();
+
+  MPI_Allreduce(sfacrl,sfacrl_all,kcount,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(sfacim,sfacim_all,kcount,MPI_DOUBLE,MPI_SUM,world);
+
+  // K-space portion of electric field
+  // double loop over K-vectors and local atoms
+  // perform per-atom calculations if needed
+
+  double **f = atom->f;
+  double **fm_long = atom->fm_long;
+  double **sp = atom->sp;
+  int nlocal = atom->nlocal;
+
+  int kx,ky,kz;
+  double cypz,sypz,exprl,expim;
+  double partial,partial2,partial_peratom;
+  double vcik[6];
+  double mudotk;
+
+  for (i = 0; i < nlocal; i++) {
+    ek[i][0] = ek[i][1] = ek[i][2] = 0.0;
+    tk[i][0] = tk[i][1] = tk[i][2] = 0.0;
+  }
+
+  for (k = 0; k < kcount; k++) {
+    kx = kxvecs[k];
+    ky = kyvecs[k];
+    kz = kzvecs[k];
+    for (j = 0; j<6; j++) vc[k][j] = 0.0;
+
+    for (i = 0; i < nlocal; i++) {
+
+      for (j = 0; j<6; j++) vcik[j] = 0.0;
+
+      // re-evaluating sp dot k
+      
+      spx = sp[i][0]*sp[i][3];
+      spy = sp[i][1]*sp[i][3];
+      spz = sp[i][2]*sp[i][3];
+      mudotk = spx*kx*unitk[0] + spy*ky*unitk[1] + spz*kz*unitk[2];
+
+      // calculating  re and im of exp(i*k*ri)
+
+      cypz = cs[ky][1][i]*cs[kz][2][i] - sn[ky][1][i]*sn[kz][2][i];
+      sypz = sn[ky][1][i]*cs[kz][2][i] + cs[ky][1][i]*sn[kz][2][i];
+      exprl = cs[kx][0][i]*cypz - sn[kx][0][i]*sypz;
+      expim = sn[kx][0][i]*cypz + cs[kx][0][i]*sypz;
+
+      // taking im of struct_fact x exp(i*k*ri) (for force calc.)
+
+      partial = mudotk*(expim*sfacrl_all[k] - exprl*sfacim_all[k]);
+      ek[i][0] += partial * eg[k][0];
+      ek[i][1] += partial * eg[k][1];
+      ek[i][2] += partial * eg[k][2];
+
+      // compute field for torque calculation
+
+      partial_peratom = exprl*sfacrl_all[k] + expim*sfacim_all[k];
+      tk[i][0] += partial2*eg[k][0];
+      tk[i][1] += partial2*eg[k][1];
+      tk[i][2] += partial2*eg[k][2];
+
+      // total and per-atom virial correction
+      
+      vc[k][0] += vcik[0] = -(partial_peratom * spx * eg[k][0]);
+      vc[k][1] += vcik[1] = -(partial_peratom * spy * eg[k][1]);
+      vc[k][2] += vcik[2] = -(partial_peratom * spz * eg[k][2]);
+      vc[k][3] += vcik[3] = -(partial_peratom * spx * eg[k][1]);
+      vc[k][4] += vcik[4] = -(partial_peratom * spx * eg[k][2]);
+      vc[k][5] += vcik[5] = -(partial_peratom * spy * eg[k][2]);
+      
+      // taking re-part of struct_fact x exp(i*k*ri) 
+      // (for per-atom energy and virial calc.)
+
+      if (evflag_atom) {
+        if (eflag_atom) eatom[i] += mudotk*ug[k]*partial_peratom;
+        if (vflag_atom)
+          for (j = 0; j < 6; j++)
+	    vatom[i][j] += (ug[k]*mudotk*vg[k][j]*partial_peratom - vcik[j]);
+      }
+    }
+  }
+
+  // force and mag. precession vectors calculation
+  
+  const double spscale = mub2mu0 * scale;
+  const double spscale2 = mub2mu0hbinv * scale;
+
+  for (i = 0; i < nlocal; i++) {
+    f[i][0] += spscale * ek[i][0];
+    f[i][1] += spscale * ek[i][1];
+    if (slabflag != 2) f[i][2] += spscale * ek[i][2];
+    fm_long[i][0] += spscale2 * tk[i][0];
+    fm_long[i][1] += spscale2 * tk[i][1];
+    if (slabflag != 2) fm_long[i][2] += spscale2 * tk[i][3];
+  }
+  
+  // sum global energy across Kspace vevs and add in volume-dependent term
+  // taking the re-part of struct_fact_i x struct_fact_j
+  // substracting self energy and scaling
+
+  if (eflag_global) {
+    for (k = 0; k < kcount; k++) {
+      energy += ug[k] * (sfacrl_all[k]*sfacrl_all[k] +
+                         sfacim_all[k]*sfacim_all[k]);
+    }
+    energy -= musqsum*2.0*g3/3.0/MY_PIS;
+    energy *= spscale;
+  }
+
+  // global virial
+
+  if (vflag_global) {
+    double uk, vk;
+    for (k = 0; k < kcount; k++) {
+      uk = ug[k] * (sfacrl_all[k]*sfacrl_all[k] + sfacim_all[k]*sfacim_all[k]);
+      for (j = 0; j < 6; j++) virial[j] += uk*vg[k][j] - vc[k][j];
+    }
+    for (j = 0; j < 6; j++) virial[j] *= spscale;
+  }
+
+  // per-atom energy/virial
+  // energy includes self-energy correction
+
+  if (evflag_atom) {
+    if (eflag_atom) {
+      for (i = 0; i < nlocal; i++) {
+	spx = sp[i][0]*sp[i][3];
+	spy = sp[i][1]*sp[i][3];
+	spz = sp[i][2]*sp[i][3];
+        eatom[i] -= (spx*spx + spy*spy + spz*spz)
+	  *2.0*g3/3.0/MY_PIS;
+        eatom[i] *= spscale;
+      }
+    }
+
+    if (vflag_atom)
+      for (i = 0; i < nlocal; i++)
+        for (j = 0; j < 6; j++) vatom[i][j] *= spscale;
+  }
+
+  // 2d slab correction
+
+  if (slabflag == 1) slabcorr();
+}
+
+/* ----------------------------------------------------------------------
+   compute the struc. factors and mu dot k products
+------------------------------------------------------------------------- */
+
+void EwaldDipoleSpin::eik_dot_r()
+{
+  int i,k,l,m,n,ic;
+  double cstr1,sstr1,cstr2,sstr2,cstr3,sstr3,cstr4,sstr4;
+  double sqk,clpm,slpm;
+  double spx, spy, spz, spi;
+  double mudotk;
+
+  double **x = atom->x;
+  double **sp = atom->sp;
+  int nlocal = atom->nlocal;
+
+  n = 0;
+  spi = spx = spy = spz = 0.0;
+
+  // loop on different k-directions
+  // loop on n kpoints and nlocal atoms
+  // store (n x nlocal) tab. of values of (mu_i dot k)
+  // store n values of sum_j[ (mu_j dot k) exp(-k dot r_j) ]
+
+  // (k,0,0), (0,l,0), (0,0,m)
+  
+  // loop 1: k=1, l=1, m=1
+  // define first val. of cos and sin
+
+  for (ic = 0; ic < 3; ic++) {
+    sqk = unitk[ic]*unitk[ic];
+    if (sqk <= gsqmx) {
+      cstr1 = 0.0;
+      sstr1 = 0.0;
+      for (i = 0; i < nlocal; i++) {
+        cs[0][ic][i] = 1.0;
+        sn[0][ic][i] = 0.0;
+        cs[1][ic][i] = cos(unitk[ic]*x[i][ic]);
+        sn[1][ic][i] = sin(unitk[ic]*x[i][ic]);
+        cs[-1][ic][i] = cs[1][ic][i];
+        sn[-1][ic][i] = -sn[1][ic][i];
+	spi = sp[i][ic]*sp[i][3];
+        mudotk = (spi*unitk[ic]);
+        cstr1 += mudotk*cs[1][ic][i];
+        sstr1 += mudotk*sn[1][ic][i];
+      }
+      sfacrl[n] = cstr1;
+      sfacim[n++] = sstr1;
+    }
+  }
+
+  // loop 2: k>1, l>1, m>1
+
+  for (m = 2; m <= kmax; m++) {
+    for (ic = 0; ic < 3; ic++) {
+      sqk = m*unitk[ic] * m*unitk[ic];
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+          cs[m][ic][i] = cs[m-1][ic][i]*cs[1][ic][i] -
+            sn[m-1][ic][i]*sn[1][ic][i];
+          sn[m][ic][i] = sn[m-1][ic][i]*cs[1][ic][i] +
+            cs[m-1][ic][i]*sn[1][ic][i];
+          cs[-m][ic][i] = cs[m][ic][i];
+          sn[-m][ic][i] = -sn[m][ic][i];
+	  spi = sp[i][ic]*sp[i][3];
+	  mudotk = (spi*m*unitk[ic]);
+          cstr1 += mudotk*cs[m][ic][i];
+          sstr1 += mudotk*sn[m][ic][i];
+        }
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+      }
+    }
+  }
+
+  // 1 = (k,l,0), 2 = (k,-l,0)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (l = 1; l <= kymax; l++) {
+      sqk = (k*unitk[0] * k*unitk[0]) + (l*unitk[1] * l*unitk[1]);
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        cstr2 = 0.0;
+        sstr2 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+	  spx = sp[i][0]*sp[i][3];
+	  spy = sp[i][1]*sp[i][3];
+
+	  // dir 1: (k,l,0)
+	  mudotk = (spx*k*unitk[0] + spy*l*unitk[1]);
+          cstr1 += mudotk*(cs[k][0][i]*cs[l][1][i]-sn[k][0][i]*sn[l][1][i]);
+          sstr1 += mudotk*(sn[k][0][i]*cs[l][1][i]+cs[k][0][i]*sn[l][1][i]);
+	  
+	  // dir 2: (k,-l,0)
+	  mudotk = (spx*k*unitk[0] - spy*l*unitk[1]);
+          cstr2 += mudotk*(cs[k][0][i]*cs[l][1][i]+sn[k][0][i]*sn[l][1][i]);
+          sstr2 += mudotk*(sn[k][0][i]*cs[l][1][i]-cs[k][0][i]*sn[l][1][i]);
+	}
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+        sfacrl[n] = cstr2;
+        sfacim[n++] = sstr2;
+      }
+    }
+  }
+
+  // 1 = (0,l,m), 2 = (0,l,-m)
+
+  for (l = 1; l <= kymax; l++) {
+    for (m = 1; m <= kzmax; m++) {
+      sqk = (l*unitk[1] * l*unitk[1]) + (m*unitk[2] * m*unitk[2]);
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        cstr2 = 0.0;
+        sstr2 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+	  spy = sp[i][1]*sp[i][3];
+	  spz = sp[i][2]*sp[i][3];
+
+	  // dir 1: (0,l,m)
+      	  mudotk = (spy*l*unitk[1] + spz*m*unitk[2]); 
+          cstr1 += mudotk*(cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i]);
+          sstr1 += mudotk*(sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i]);
+	  
+	  // dir 2: (0,l,-m)
+	  mudotk = (spy*l*unitk[1] - spz*m*unitk[2]); 
+          cstr2 += mudotk*(cs[l][1][i]*cs[m][2][i]+sn[l][1][i]*sn[m][2][i]);
+          sstr2 += mudotk*(sn[l][1][i]*cs[m][2][i]-cs[l][1][i]*sn[m][2][i]);
+        }
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+        sfacrl[n] = cstr2;
+        sfacim[n++] = sstr2;
+      }
+    }
+  }
+
+  // 1 = (k,0,m), 2 = (k,0,-m)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (m = 1; m <= kzmax; m++) {
+      sqk = (k*unitk[0] * k*unitk[0]) + (m*unitk[2] * m*unitk[2]);
+      if (sqk <= gsqmx) {
+        cstr1 = 0.0;
+        sstr1 = 0.0;
+        cstr2 = 0.0;
+        sstr2 = 0.0;
+        for (i = 0; i < nlocal; i++) {
+      	  spx = sp[i][0]*sp[i][3];
+	  spz = sp[i][2]*sp[i][3];
+
+	  // dir 1: (k,0,m)
+	  mudotk = (spx*k*unitk[0] + spz*m*unitk[2]); 
+          cstr1 += mudotk*(cs[k][0][i]*cs[m][2][i]-sn[k][0][i]*sn[m][2][i]);
+          sstr1 += mudotk*(sn[k][0][i]*cs[m][2][i]+cs[k][0][i]*sn[m][2][i]);
+	  
+	  // dir 2: (k,0,-m)
+	  mudotk = (spx*k*unitk[0] - spz*m*unitk[2]); 
+          cstr2 += mudotk*(cs[k][0][i]*cs[m][2][i]+sn[k][0][i]*sn[m][2][i]);
+          sstr2 += mudotk*(sn[k][0][i]*cs[m][2][i]-cs[k][0][i]*sn[m][2][i]);
+        }
+        sfacrl[n] = cstr1;
+        sfacim[n++] = sstr1;
+        sfacrl[n] = cstr2;
+        sfacim[n++] = sstr2;
+      }
+    }
+  }
+
+  // 1 = (k,l,m), 2 = (k,-l,m), 3 = (k,l,-m), 4 = (k,-l,-m)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (l = 1; l <= kymax; l++) {
+      for (m = 1; m <= kzmax; m++) {
+        sqk = (k*unitk[0] * k*unitk[0]) + (l*unitk[1] * l*unitk[1]) +
+          (m*unitk[2] * m*unitk[2]);
+        if (sqk <= gsqmx) {
+          cstr1 = 0.0;
+          sstr1 = 0.0;
+          cstr2 = 0.0;
+          sstr2 = 0.0;
+          cstr3 = 0.0;
+          sstr3 = 0.0;
+          cstr4 = 0.0;
+          sstr4 = 0.0;
+          for (i = 0; i < nlocal; i++) {
+      	    spx = sp[i][0]*sp[i][3];
+	    spy = sp[i][1]*sp[i][3];
+	    spz = sp[i][2]*sp[i][3];
+
+	    // dir 1: (k,l,m)
+	    mudotk = (spx*k*unitk[0] + spy*l*unitk[1] + spz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
+            slpm = sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
+            cstr1 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr1 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+	    // dir 2: (k,-l,m)
+	    mudotk = (spx*k*unitk[0] - spy*l*unitk[1] + spz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i];
+            slpm = -sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
+            cstr2 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr2 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+	    // dir 3: (k,l,-m)
+	    mudotk = (spx*k*unitk[0] + spy*l*unitk[1] - spz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] + sn[l][1][i]*sn[m][2][i];
+            slpm = sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i];
+            cstr3 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr3 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+
+	    // dir 4: (k,-l,-m)
+	    mudotk = (spx*k*unitk[0] - spy*l*unitk[1] - spz*m*unitk[2]); 
+            clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
+            slpm = -sn[l][1][i]*cs[m][2][i] - cs[l][1][i]*sn[m][2][i];
+            cstr4 += mudotk*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+            sstr4 += mudotk*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+          }
+          sfacrl[n] = cstr1;
+          sfacim[n++] = sstr1;
+          sfacrl[n] = cstr2;
+          sfacim[n++] = sstr2;
+          sfacrl[n] = cstr3;
+          sfacim[n++] = sstr3;
+          sfacrl[n] = cstr4;
+          sfacim[n++] = sstr4;
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Slab-geometry correction term to dampen inter-slab interactions between
+   periodically repeating slabs.  Yields good approximation to 2D EwaldDipoleSpin if
+   adequate empty space is left between repeating slabs (J. Chem. Phys.
+   111, 3155).  Slabs defined here to be parallel to the xy plane. Also
+   extended to non-neutral systems (J. Chem. Phys. 131, 094107).
+------------------------------------------------------------------------- */
+
+void EwaldDipoleSpin::slabcorr()
+{
+  // compute local contribution to global dipole/spin moment
+
+  double **x = atom->x;
+  double zprd = domain->zprd;
+  int nlocal = atom->nlocal;
+
+  double spin = 0.0;
+  double **sp = atom->sp;
+  double spx,spy,spz;
+  for (int i = 0; i < nlocal; i++) { 
+    spz = sp[i][2]*sp[i][3];
+    spin += spz;
+  }
+
+  // sum local contributions to get global spin moment
+
+  double spin_all;
+  MPI_Allreduce(&spin,&spin_all,1,MPI_DOUBLE,MPI_SUM,world);
+
+  // need to make non-neutral systems and/or
+  //  per-atom energy translationally invariant
+
+  if (eflag_atom || fabs(qsum) > SMALL) {
+
+    error->all(FLERR,"Cannot (yet) use kspace slab correction with "
+      "long-range spins and non-neutral systems or per-atom energy");
+  }
+
+  // compute corrections
+
+  const double e_slabcorr = MY_2PI*(spin_all*spin_all/12.0)/volume;
+  const double spscale = mub2mu0 * scale;
+
+  if (eflag_global) energy += spscale * e_slabcorr;
+
+  // per-atom energy
+
+  if (eflag_atom) {
+    double efact = spscale * MY_2PI/volume/12.0;
+    for (int i = 0; i < nlocal; i++) {
+      spz = sp[i][2]*sp[i][3];
+      eatom[i] += efact * spz * spin_all;
+    }
+  }
+
+  // add on mag. force corrections
+
+  double ffact = spscale * (-4.0*MY_PI/volume);
+  double **fm_long = atom->fm_long;
+  for (int i = 0; i < nlocal; i++) {
+    fm_long[i][2] += ffact * spin_all;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute musum,musqsum,mu2 for magnetic spins
+   called initially, when particle count changes, when spins are changed
+------------------------------------------------------------------------- */
+
+void EwaldDipoleSpin::spsum_musq()
+{
+  const int nlocal = atom->nlocal;
+
+  musum = musqsum = mu2 = 0.0;
+  if (atom->sp_flag) {
+    double** sp = atom->sp;
+    double spx,spy,spz;
+    double musum_local(0.0), musqsum_local(0.0);
+
+    for (int i = 0; i < nlocal; i++) {
+      spx = sp[i][0]*sp[i][3];
+      spy = sp[i][1]*sp[i][3];
+      spz = sp[i][2]*sp[i][3];
+      musum_local += spx + spy + spz;
+      musqsum_local += spx*spx + spy*spy + spz*spz;
+    }
+
+    MPI_Allreduce(&musum_local,&musum,1,MPI_DOUBLE,MPI_SUM,world);
+    MPI_Allreduce(&musqsum_local,&musqsum,1,MPI_DOUBLE,MPI_SUM,world);
+
+    //mu2 = musqsum * mub2mu0;
+    mu2 = musqsum;
+  }
+
+  if (mu2 == 0 && comm->me == 0)
+    error->all(FLERR,"Using kspace solver EwaldDipoleSpin on system with no spins");
+}
diff --git a/src/KSPACE/ewald_dipole_spin.h b/src/KSPACE/ewald_dipole_spin.h
new file mode 100644
index 0000000000..20852c08c1
--- /dev/null
+++ b/src/KSPACE/ewald_dipole_spin.h
@@ -0,0 +1,102 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(ewald/dipole/spin,EwaldDipoleSpin)
+
+#else
+
+#ifndef LMP_EWALD_DIPOLE_SPIN_H
+#define LMP_EWALD_DIPOLE_SPIN_H
+
+#include "ewald_dipole.h"
+
+namespace LAMMPS_NS {
+
+class EwaldDipoleSpin : public EwaldDipole {
+ public:
+  EwaldDipoleSpin(class LAMMPS *);
+  virtual ~EwaldDipoleSpin();
+  void init();
+  void setup();
+  void compute(int, int);
+
+ protected:
+  double hbar;                  // reduced Planck's constant      
+  double mub;                   // Bohr's magneton                
+  double mu_0;                  // vacuum permeability
+  double mub2mu0;               // prefactor for mech force
+  double mub2mu0hbinv;          // prefactor for mag force
+
+  void spsum_musq(); 
+  virtual void eik_dot_r();
+  void slabcorr();
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Cannot use EwaldDipoleSpin with 2d simulation
+
+The kspace style ewald cannot be used in 2d simulations.  You can use
+2d EwaldDipoleSpin in a 3d simulation; see the kspace_modify command.
+
+E: Kspace style requires atom attribute q
+
+The atom style defined does not have these attributes.
+
+E: Cannot use nonperiodic boundaries with EwaldDipoleSpin
+
+For kspace style ewald, all 3 dimensions must have periodic boundaries
+unless you use the kspace_modify command to define a 2d slab with a
+non-periodic z dimension.
+
+E: Incorrect boundaries with slab EwaldDipoleSpin
+
+Must have periodic x,y dimensions and non-periodic z dimension to use
+2d slab option with EwaldDipoleSpin.
+
+E: Cannot (yet) use EwaldDipoleSpin with triclinic box and slab correction
+
+This feature is not yet supported.
+
+E: KSpace style is incompatible with Pair style
+
+Setting a kspace style requires that a pair style with matching
+long-range Coulombic or dispersion components be used.
+
+E: KSpace accuracy must be > 0
+
+The kspace accuracy designated in the input must be greater than zero.
+
+E: Must use 'kspace_modify gewald' for uncharged system
+
+UNDOCUMENTED
+
+E: Cannot (yet) use K-space slab correction with compute group/group for triclinic systems
+
+This option is not yet supported.
+
+*/
diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp
index 773305bb5e..53c18804a5 100644
--- a/src/KSPACE/pppm.cpp
+++ b/src/KSPACE/pppm.cpp
@@ -1430,12 +1430,13 @@ void PPPM::set_grid_local()
   double zprd = prd[2];
   double zprd_slab = zprd*slab_volfactor;
 
-  double dist[3];
+  double dist[3] = {0.0,0.0,0.0};
   double cuthalf = 0.5*neighbor->skin + qdist;
   if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf;
   else kspacebbox(cuthalf,&dist[0]);
 
   int nlo,nhi;
+  nlo = nhi = 0;
 
   nlo = static_cast<int> ((sublo[0]-dist[0]-boxlo[0]) *
                             nx_pppm/xprd + shift) - OFFSET;
diff --git a/src/KSPACE/pppm.h b/src/KSPACE/pppm.h
index c9190d9fca..1ce1a0d666 100644
--- a/src/KSPACE/pppm.h
+++ b/src/KSPACE/pppm.h
@@ -42,7 +42,7 @@ class PPPM : public KSpace {
   virtual void settings(int, char **);
   virtual void init();
   virtual void setup();
-  void setup_grid();
+  virtual void setup_grid();
   virtual void compute(int, int);
   virtual int timing_1d(int, double &);
   virtual int timing_3d(int, double &);
@@ -106,10 +106,10 @@ class PPPM : public KSpace {
   double qdist;                // distance from O site to negative charge
   double alpha;                // geometric factor
 
-  void set_grid_global();
+  virtual void set_grid_global();
   void set_grid_local();
   void adjust_gewald();
-  double newton_raphson_f();
+  virtual double newton_raphson_f();
   double derivf();
   double final_accuracy();
 
@@ -146,7 +146,7 @@ class PPPM : public KSpace {
   void compute_drho1d(const FFT_SCALAR &, const FFT_SCALAR &,
                      const FFT_SCALAR &);
   void compute_rho_coeff();
-  void slabcorr();
+  virtual void slabcorr();
 
   // grid communication
 
diff --git a/src/KSPACE/pppm_dipole.cpp b/src/KSPACE/pppm_dipole.cpp
new file mode 100644
index 0000000000..21a777dd75
--- /dev/null
+++ b/src/KSPACE/pppm_dipole.cpp
@@ -0,0 +1,2568 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Stan Moore (SNL), Julien Tranchida (SNL)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include "pppm_dipole.h"
+#include "atom.h"
+#include "comm.h"
+#include "gridcomm.h"
+#include "neighbor.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "domain.h"
+#include "fft3d_wrap.h"
+#include "remap_wrap.h"
+#include "memory.h"
+#include "error.h"
+#include "update.h"
+
+#include "math_const.h"
+#include "math_special.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecial;
+
+#define MAXORDER 7
+#define OFFSET 16384
+#define LARGE 10000.0
+#define SMALL 0.00001
+#define EPS_HOC 1.0e-7
+
+enum{REVERSE_MU};
+enum{FORWARD_MU,FORWARD_MU_PERATOM};
+
+#ifdef FFT_SINGLE
+#define ZEROF 0.0f
+#define ONEF  1.0f
+#else
+#define ZEROF 0.0
+#define ONEF  1.0
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+PPPMDipole::PPPMDipole(LAMMPS *lmp) : PPPM(lmp),
+  densityx_brick_dipole(NULL), densityy_brick_dipole(NULL), 
+  densityz_brick_dipole(NULL), ux_brick_dipole(NULL), 
+  uy_brick_dipole(NULL), uz_brick_dipole(NULL), vdxx_brick_dipole(NULL), 
+  vdxy_brick_dipole(NULL), vdyy_brick_dipole(NULL), 
+  vdxz_brick_dipole(NULL), vdyz_brick_dipole(NULL), 
+  vdzz_brick_dipole(NULL), v0x_brick_dipole(NULL), v1x_brick_dipole(NULL), 
+  v2x_brick_dipole(NULL), v3x_brick_dipole(NULL), v4x_brick_dipole(NULL), 
+  v5x_brick_dipole(NULL), v0y_brick_dipole(NULL), v1y_brick_dipole(NULL), 
+  v2y_brick_dipole(NULL), v3y_brick_dipole(NULL), v4y_brick_dipole(NULL), 
+  v5y_brick_dipole(NULL), v0z_brick_dipole(NULL), v1z_brick_dipole(NULL), 
+  v2z_brick_dipole(NULL), v3z_brick_dipole(NULL), v4z_brick_dipole(NULL), 
+  v5z_brick_dipole(NULL), work3(NULL), work4(NULL), 
+  densityx_fft_dipole(NULL), densityy_fft_dipole(NULL), 
+  densityz_fft_dipole(NULL)
+{
+  dipoleflag = 1;
+  group_group_enable = 0;
+
+  cg_dipole = NULL;
+  cg_peratom_dipole = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   free all memory
+------------------------------------------------------------------------- */
+
+PPPMDipole::~PPPMDipole()
+{
+  if (copymode) return;
+
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+  fft1 = NULL;
+  fft2 = NULL;
+  remap = NULL;
+  cg_dipole = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   called once before run
+------------------------------------------------------------------------- */
+
+void PPPMDipole::init()
+{
+  if (me == 0) {
+    if (screen) fprintf(screen,"PPPMDipole initialization ...\n");
+    if (logfile) fprintf(logfile,"PPPMDipole initialization ...\n");
+  }
+
+  // error check
+
+  dipoleflag = atom->mu?1:0;
+  qsum_qsq(0); // q[i] might not be declared ?
+
+  if (dipoleflag && q2)
+    error->all(FLERR,"Cannot (yet) use charges with Kspace style PPPMDipole");
+
+  triclinic_check();
+
+  if (triclinic != domain->triclinic)
+    error->all(FLERR,"Must redefine kspace_style after changing to triclinic box");
+
+  if (domain->dimension == 2)
+    error->all(FLERR,"Cannot use PPPMDipole with 2d simulation");
+
+  if (comm->style != 0)
+    error->universe_all(FLERR,"PPPMDipole can only currently be used with "
+                        "comm_style brick");
+
+  if (!atom->mu) error->all(FLERR,"Kspace style requires atom attribute mu");
+
+  if (atom->mu && differentiation_flag == 1) error->all(FLERR,"Cannot (yet) use kspace_modify diff"
+       " ad with dipoles");
+
+  if (dipoleflag && strcmp(update->unit_style,"electron") == 0)
+    error->all(FLERR,"Cannot (yet) use 'electron' units with dipoles");
+
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all(FLERR,"Cannot use nonperiodic boundaries with PPPMDipole");
+  if (slabflag) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all(FLERR,"Incorrect boundaries with slab PPPMDipole");
+  }
+
+  if (order < 2 || order > MAXORDER) {
+    char str[128];
+    sprintf(str,"PPPMDipole order cannot be < 2 or > than %d",MAXORDER);
+    error->all(FLERR,str);
+  }
+
+  // extract short-range Coulombic cutoff from pair style
+
+  triclinic = domain->triclinic;
+  if (triclinic)
+    error->all(FLERR,"Cannot yet use triclinic cells with PPPMDipole");
+
+  pair_check();
+
+  int itmp = 0;
+  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
+  if (p_cutoff == NULL)
+    error->all(FLERR,"KSpace style is incompatible with Pair style");
+  cutoff = *p_cutoff;
+
+  // kspace TIP4P not yet supported
+  // qdist = offset only for TIP4P fictitious charge
+
+  qdist = 0.0;
+  if (tip4pflag)
+    error->all(FLERR,"Cannot yet use TIP4P with PPPMDipole");
+
+  // compute musum & musqsum and warn if no dipoles
+
+  scale = 1.0;
+  qqrd2e = force->qqrd2e;
+  musum_musq();
+  natoms_original = atom->natoms;
+
+  // set accuracy (force units) from accuracy_relative or accuracy_absolute
+
+  if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
+  else accuracy = accuracy_relative * two_charge_force;
+
+  // free all arrays previously allocated
+
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+
+  // setup FFT grid resolution and g_ewald
+  // normally one iteration thru while loop is all that is required
+  // if grid stencil does not extend beyond neighbor proc
+  //   or overlap is allowed, then done
+  // else reduce order and try again
+
+  int (*procneigh)[2] = comm->procneigh;
+
+  GridComm *cgtmp = NULL;
+  int iteration = 0;
+
+  while (order >= minorder) {
+    if (iteration && me == 0)
+      error->warning(FLERR,"Reducing PPPMDipole order b/c stencil extends "
+                     "beyond nearest neighbor processor");
+
+    compute_gf_denom();
+    set_grid_global();
+    set_grid_local();
+    if (overlap_allowed) break;
+
+    cgtmp = new GridComm(lmp,world,1,1,
+                         nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                         nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                         procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                         procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+    cgtmp->ghost_notify();
+    if (!cgtmp->ghost_overlap()) break;
+    delete cgtmp;
+
+    order--;
+    iteration++;
+  }
+
+  if (order < minorder) error->all(FLERR,"PPPMDipole order < minimum allowed order");
+  if (!overlap_allowed && cgtmp->ghost_overlap())
+    error->all(FLERR,"PPPMDipole grid stencil extends "
+               "beyond nearest neighbor processor");
+  if (cgtmp) delete cgtmp;
+
+  // adjust g_ewald
+
+  if (!gewaldflag) adjust_gewald();
+
+  // calculate the final accuracy
+
+  double estimated_accuracy = final_accuracy_dipole();
+
+  // print stats
+
+  int ngrid_max,nfft_both_max;
+  MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world);
+  MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world);
+
+  if (me == 0) {
+
+#ifdef FFT_SINGLE
+    const char fft_prec[] = "single";
+#else
+    const char fft_prec[] = "double";
+#endif
+
+    if (screen) {
+      fprintf(screen,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(screen,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      fprintf(screen,"  stencil order = %d\n",order);
+      fprintf(screen,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(screen,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(screen,"  using %s precision FFTs\n",fft_prec);
+      fprintf(screen,"  3d grid and FFT values/proc = %d %d\n",
+              ngrid_max,nfft_both_max);
+    }
+    if (logfile) {
+      fprintf(logfile,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(logfile,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      fprintf(logfile,"  stencil order = %d\n",order);
+      fprintf(logfile,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(logfile,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(logfile,"  using %s precision FFTs\n",fft_prec);
+      fprintf(logfile,"  3d grid and FFT values/proc = %d %d\n",
+              ngrid_max,nfft_both_max);
+    }
+  }
+
+  // allocate K-space dependent memory
+  // don't invoke allocate peratom(), will be allocated when needed
+
+  allocate();
+  cg_dipole->ghost_notify();
+  cg_dipole->setup();
+
+  // pre-compute Green's function denomiator expansion
+  // pre-compute 1d charge distribution coefficients
+
+  compute_gf_denom();
+  compute_rho_coeff();
+}
+
+/* ----------------------------------------------------------------------
+   adjust PPPMDipole coeffs, called initially and whenever volume has changed
+------------------------------------------------------------------------- */
+
+void PPPMDipole::setup()
+{
+  // perform some checks to avoid illegal boundaries with read_data
+
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all(FLERR,"Cannot use nonperiodic boundaries with PPPMDipole");
+  if (slabflag) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all(FLERR,"Incorrect boundaries with slab PPPMDipole");
+  }
+
+  int i,j,k,n;
+  double *prd;
+
+  // volume-dependent factors
+  // adjust z dimension for 2d slab PPPMDipole
+  // z dimension for 3d PPPMDipole is zprd since slab_volfactor = 1.0
+
+  prd = domain->prd;
+  double xprd = prd[0];
+  double yprd = prd[1];
+  double zprd = prd[2];
+  double zprd_slab = zprd*slab_volfactor;
+  volume = xprd * yprd * zprd_slab;
+
+  delxinv = nx_pppm/xprd;
+  delyinv = ny_pppm/yprd;
+  delzinv = nz_pppm/zprd_slab;
+
+  delvolinv = delxinv*delyinv*delzinv;
+
+  double unitkx = (MY_2PI/xprd);
+  double unitky = (MY_2PI/yprd);
+  double unitkz = (MY_2PI/zprd_slab);
+
+  // fkx,fky,fkz for my FFT grid pts
+
+  double per;
+
+  for (i = nxlo_fft; i <= nxhi_fft; i++) {
+    per = i - nx_pppm*(2*i/nx_pppm);
+    fkx[i] = unitkx*per;
+  }
+
+  for (i = nylo_fft; i <= nyhi_fft; i++) {
+    per = i - ny_pppm*(2*i/ny_pppm);
+    fky[i] = unitky*per;
+  }
+
+  for (i = nzlo_fft; i <= nzhi_fft; i++) {
+    per = i - nz_pppm*(2*i/nz_pppm);
+    fkz[i] = unitkz*per;
+  }
+
+  // virial coefficients
+
+  double sqk,vterm;
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++) {
+    for (j = nylo_fft; j <= nyhi_fft; j++) {
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k];
+        if (sqk == 0.0) {
+          vg[n][0] = 0.0;
+          vg[n][1] = 0.0;
+          vg[n][2] = 0.0;
+          vg[n][3] = 0.0;
+          vg[n][4] = 0.0;
+          vg[n][5] = 0.0;
+        } else {
+          vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
+          vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i];
+          vg[n][1] = 1.0 + vterm*fky[j]*fky[j];
+          vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k];
+          vg[n][3] = vterm*fkx[i]*fky[j];
+          vg[n][4] = vterm*fkx[i]*fkz[k];
+          vg[n][5] = vterm*fky[j]*fkz[k];
+        }
+        n++;
+      }
+    }
+  }
+
+  compute_gf_dipole();
+}
+
+/* ----------------------------------------------------------------------
+   reset local grid arrays and communication stencils
+   called by fix balance b/c it changed sizes of processor sub-domains
+------------------------------------------------------------------------- */
+
+void PPPMDipole::setup_grid()
+{
+  // free all arrays previously allocated
+
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+
+  // reset portion of global grid that each proc owns
+
+  set_grid_local();
+
+  // reallocate K-space dependent memory
+  // check if grid communication is now overlapping if not allowed
+  // don't invoke allocate peratom(), will be allocated when needed
+
+  allocate();
+
+  cg_dipole->ghost_notify();
+  if (overlap_allowed == 0 && cg_dipole->ghost_overlap())
+    error->all(FLERR,"PPPMDipole grid stencil extends "
+               "beyond nearest neighbor processor");
+  cg_dipole->setup();
+
+  // pre-compute Green's function denomiator expansion
+  // pre-compute 1d charge distribution coefficients
+
+  compute_gf_denom();
+  compute_rho_coeff();
+
+  // pre-compute volume-dependent coeffs
+
+  setup();
+}
+
+/* ----------------------------------------------------------------------
+   compute the PPPMDipole long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+void PPPMDipole::compute(int eflag, int vflag)
+{
+  int i,j;
+
+  // set energy/virial flags
+  // invoke allocate_peratom() if needed for first time
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  if (vflag_atom)
+    error->all(FLERR,"Cannot (yet) compute per-atom virial "
+                       "with kspace style pppm/dipole");
+
+  if (evflag_atom && !peratom_allocate_flag) {
+    allocate_peratom();
+    cg_peratom_dipole->ghost_notify();
+    cg_peratom_dipole->setup();
+  }
+
+  // if atom count has changed, update qsum and qsqsum
+
+  if (atom->natoms != natoms_original) {
+    musum_musq();
+    natoms_original = atom->natoms;
+  }
+
+  // return if there are no dipoles
+
+  if (musqsum == 0.0) return;
+
+  // convert atoms from box to lamda coords
+
+  boxlo = domain->boxlo;
+
+  // extend size of per-atom arrays if necessary
+
+  if (atom->nmax > nmax) {
+    memory->destroy(part2grid);
+    nmax = atom->nmax;
+    memory->create(part2grid,nmax,3,"pppm_dipole:part2grid");
+  }
+
+  // find grid points for all my particles
+  // map my particle charge onto my local 3d density grid
+
+  particle_map();
+  make_rho_dipole();
+
+  // all procs communicate density values from their ghost cells
+  //   to fully sum contribution in their 3d bricks
+  // remap from 3d decomposition to FFT decomposition
+
+  cg_dipole->reverse_comm(this,REVERSE_MU);
+  brick2fft_dipole();
+
+  // compute potential gradient on my FFT grid and
+  //   portion of e_long on this proc's FFT grid
+  // return gradients (electric fields) in 3d brick decomposition
+  // also performs per-atom calculations via poisson_peratom()
+
+  poisson_ik_dipole();
+
+  // all procs communicate E-field values
+  // to fill ghost cells surrounding their 3d bricks
+
+  cg_dipole->forward_comm(this,FORWARD_MU);
+
+  // extra per-atom energy/virial communication
+
+  if (evflag_atom) {
+    cg_peratom_dipole->forward_comm(this,FORWARD_MU_PERATOM);
+  }
+
+  // calculate the force on my particles
+
+  fieldforce_ik_dipole();
+
+  // extra per-atom energy/virial communication
+
+  if (evflag_atom) fieldforce_peratom_dipole();
+
+  // sum global energy across procs and add in volume-dependent term
+
+  const double qscale = qqrd2e * scale;
+  const double g3 = g_ewald*g_ewald*g_ewald;
+
+  if (eflag_global) {
+    double energy_all;
+    MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
+    energy = energy_all;
+
+    energy *= 0.5*volume;
+    energy -= musqsum*2.0*g3/3.0/MY_PIS;
+    energy *= qscale;
+  }
+
+  // sum global virial across procs
+
+  if (vflag_global) {
+    double virial_all[6];
+    MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
+    for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
+  }
+
+  // per-atom energy/virial
+  // energy includes self-energy correction
+
+  if (evflag_atom) {
+    double *q = atom->q;
+    double **mu = atom->mu;
+    int nlocal = atom->nlocal;
+    int ntotal = nlocal;
+
+    if (eflag_atom) {
+      for (i = 0; i < nlocal; i++) {
+        eatom[i] *= 0.5;
+        eatom[i] -= (mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + mu[i][2]*mu[i][2])*2.0*g3/3.0/MY_PIS;
+        eatom[i] *= qscale;
+      }
+    }
+
+    if (vflag_atom) {
+      for (i = 0; i < nlocal; i++)
+        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale;
+    }
+  }
+
+  // 2d slab correction
+
+  if (slabflag == 1) slabcorr();
+}
+
+/* ----------------------------------------------------------------------
+   allocate memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+void PPPMDipole::allocate()
+{
+  memory->create3d_offset(densityx_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:densityx_brick_dipole");
+  memory->create3d_offset(densityy_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:densityy_brick_dipole");
+  memory->create3d_offset(densityz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:densityz_brick_dipole");
+
+  memory->create(densityx_fft_dipole,nfft_both,"pppm_dipole:densityy_fft_dipole");
+  memory->create(densityy_fft_dipole,nfft_both,"pppm_dipole:densityy_fft_dipole");
+  memory->create(densityz_fft_dipole,nfft_both,"pppm_dipole:densityz_fft_dipole");
+
+  memory->create(greensfn,nfft_both,"pppm_dipole:greensfn");
+  memory->create(work1,2*nfft_both,"pppm_dipole:work1");
+  memory->create(work2,2*nfft_both,"pppm_dipole:work2");
+  memory->create(work3,2*nfft_both,"pppm_dipole:work3");
+  memory->create(work4,2*nfft_both,"pppm_dipole:work4");
+  memory->create(vg,nfft_both,6,"pppm_dipole:vg");
+
+  memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm_dipole:fkx");
+  memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm_dipole:fky");
+  memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm_dipole:fkz");
+
+  memory->create3d_offset(ux_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:ux_brick_dipole");
+  memory->create3d_offset(uy_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:uy_brick_dipole");
+  memory->create3d_offset(uz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:uz_brick_dipole");
+
+  memory->create3d_offset(vdxx_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:vdxx_brick_dipole");
+  memory->create3d_offset(vdxy_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:vdxy_brick_dipole");
+  memory->create3d_offset(vdyy_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:vdyy_brick_dipole");
+  memory->create3d_offset(vdxz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:vdxz_brick_dipole");
+  memory->create3d_offset(vdyz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:vdyz_brick_dipole");
+  memory->create3d_offset(vdzz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:vdzz_brick_dipole");
+
+  // summation coeffs
+
+  order_allocated = order;
+  if (!gf_b) memory->create(gf_b,order,"pppm_dipole:gf_b");
+  memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm_dipole:rho1d");
+  memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm_dipole:drho1d");
+  memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm_dipole:rho_coeff");
+  memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2,
+                          "pppm_dipole:drho_coeff");
+
+  // create 2 FFTs and a Remap
+  // 1st FFT keeps data in FFT decompostion
+  // 2nd FFT returns data in 3d brick decomposition
+  // remap takes data from 3d brick to FFT decomposition
+
+  int tmp;
+
+  fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
+                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                   0,0,&tmp,collective_flag);
+
+  fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
+                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                   nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                   0,0,&tmp,collective_flag);
+
+  remap = new Remap(lmp,world,
+                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                    1,0,0,FFT_PRECISION,collective_flag);
+
+  // create ghost grid object for rho and electric field communication
+
+  int (*procneigh)[2] = comm->procneigh;
+
+  cg_dipole = new GridComm(lmp,world,9,3,
+                           nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                           nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                           procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                           procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+}
+
+/* ----------------------------------------------------------------------
+   deallocate memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+void PPPMDipole::deallocate()
+{
+  memory->destroy3d_offset(densityx_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(densityy_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(densityz_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+
+  memory->destroy3d_offset(ux_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(uy_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(uz_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+
+  memory->destroy3d_offset(vdxx_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdxy_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdyy_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdxz_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdyz_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdzz_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+
+  memory->destroy(densityx_fft_dipole);
+  memory->destroy(densityy_fft_dipole);
+  memory->destroy(densityz_fft_dipole);
+
+  memory->destroy(greensfn);
+  memory->destroy(work1);
+  memory->destroy(work2);
+  memory->destroy(work3);
+  memory->destroy(work4);
+  memory->destroy(vg);
+
+  memory->destroy1d_offset(fkx,nxlo_fft);
+  memory->destroy1d_offset(fky,nylo_fft);
+  memory->destroy1d_offset(fkz,nzlo_fft);
+
+  memory->destroy(gf_b);
+  memory->destroy2d_offset(rho1d,-order_allocated/2);
+  memory->destroy2d_offset(drho1d,-order_allocated/2);
+  memory->destroy2d_offset(rho_coeff,(1-order_allocated)/2);
+  memory->destroy2d_offset(drho_coeff,(1-order_allocated)/2);
+
+  delete fft1;
+  delete fft2;
+  delete remap;
+  delete cg_dipole;
+}
+
+/* ----------------------------------------------------------------------
+   allocate per-atom memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+void PPPMDipole::allocate_peratom()
+{
+  peratom_allocate_flag = 1;
+
+  memory->create3d_offset(v0x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v0x_brick_dipole");
+  memory->create3d_offset(v1x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v1x_brick_dipole");
+  memory->create3d_offset(v2x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v2x_brick_dipole");
+  memory->create3d_offset(v3x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v3x_brick_dipole");
+  memory->create3d_offset(v4x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v4x_brick_dipole");
+  memory->create3d_offset(v5x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v5x_brick_dipole");
+
+  memory->create3d_offset(v0y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v0y_brick_dipole");
+  memory->create3d_offset(v1y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v1y_brick_dipole");
+  memory->create3d_offset(v2y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v2y_brick_dipole");
+  memory->create3d_offset(v3y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v3y_brick_dipole");
+  memory->create3d_offset(v4y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v4y_brick_dipole");
+  memory->create3d_offset(v5y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v5y_brick_dipole");
+
+  memory->create3d_offset(v0z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v0z_brick_dipole");
+  memory->create3d_offset(v1z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v1z_brick_dipole");
+  memory->create3d_offset(v2z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v2z_brick_dipole");
+  memory->create3d_offset(v3z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v3z_brick_dipole");
+  memory->create3d_offset(v4z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v4z_brick_dipole");
+  memory->create3d_offset(v5z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v5z_brick_dipole");
+
+  // create ghost grid object for rho and electric field communication
+
+  int (*procneigh)[2] = comm->procneigh;
+
+  cg_peratom_dipole =
+    new GridComm(lmp,world,18,1,
+                 nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                 nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                 procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                 procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+}
+
+/* ----------------------------------------------------------------------
+   deallocate per-atom memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+void PPPMDipole::deallocate_peratom()
+{
+  peratom_allocate_flag = 0;
+
+  memory->destroy3d_offset(v0x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v1x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v2x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v3x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v4x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v5x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+
+  memory->destroy3d_offset(v0y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v1y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v2y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v3y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v4y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v5y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+
+  memory->destroy3d_offset(v0z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v1z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v2z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v3z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v4z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v5z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+
+  delete cg_peratom_dipole;
+}
+
+/* ----------------------------------------------------------------------
+   set global size of PPPMDipole grid = nx,ny,nz_pppm
+   used for charge accumulation, FFTs, and electric field interpolation
+------------------------------------------------------------------------- */
+
+void PPPMDipole::set_grid_global()
+{
+  // use xprd,yprd,zprd
+  // adjust z dimension for 2d slab PPPMDipole
+  // 3d PPPMDipole just uses zprd since slab_volfactor = 1.0
+
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  double zprd_slab = zprd*slab_volfactor;
+
+  // make initial g_ewald estimate
+  // based on desired accuracy and real space cutoff
+  // fluid-occupied volume used to estimate real-space error
+  // zprd used rather than zprd_slab
+
+  double h;
+  bigint natoms = atom->natoms;
+
+  if (!gewaldflag) {
+    if (accuracy <= 0.0)
+      error->all(FLERR,"KSpace accuracy must be > 0");
+    if (mu2 == 0.0)
+     error->all(FLERR,"Must use kspace_modify gewald for systems with no dipoles");
+    g_ewald = (1.35 - 0.15*log(accuracy))/cutoff;
+    double g_ewald_new =
+      find_gewald_dipole(g_ewald,cutoff,natoms,xprd*yprd*zprd,mu2);
+    if (g_ewald_new > 0.0) g_ewald = g_ewald_new;
+    else error->warning(FLERR,"PPPMDipole dipole Newton solver failed, "
+                        "using old method to estimate g_ewald");
+  }
+
+  // set optimal nx_pppm,ny_pppm,nz_pppm based on order and accuracy
+  // nz_pppm uses extended zprd_slab instead of zprd
+  // reduce it until accuracy target is met
+
+  if (!gridflag) {
+
+    h = h_x = h_y = h_z = 4.0/g_ewald;
+    int count = 0;
+    while (1) {
+
+      // set grid dimension
+      
+      nx_pppm = static_cast<int> (xprd/h_x);
+      ny_pppm = static_cast<int> (yprd/h_y);
+      nz_pppm = static_cast<int> (zprd_slab/h_z);
+
+      if (nx_pppm <= 1) nx_pppm = 2;
+      if (ny_pppm <= 1) ny_pppm = 2;
+      if (nz_pppm <= 1) nz_pppm = 2;
+
+      // set local grid dimension
+      
+      int npey_fft,npez_fft;
+      if (nz_pppm >= nprocs) {
+        npey_fft = 1;
+        npez_fft = nprocs;
+      } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
+
+      int me_y = me % npey_fft;
+      int me_z = me / npey_fft;
+
+      nxlo_fft = 0;
+      nxhi_fft = nx_pppm - 1;
+      nylo_fft = me_y*ny_pppm/npey_fft;
+      nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
+      nzlo_fft = me_z*nz_pppm/npez_fft;
+      nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
+
+      double df_kspace = compute_df_kspace_dipole();
+
+      count++;
+
+      // break loop if the accuracy has been reached or
+      // too many loops have been performed
+
+      if (df_kspace <= accuracy) break;
+      if (count > 500) error->all(FLERR, "Could not compute grid size");
+      h *= 0.95;
+      h_x = h_y = h_z = h;
+    }
+  }
+
+  // boost grid size until it is factorable
+
+  while (!factorable(nx_pppm)) nx_pppm++;
+  while (!factorable(ny_pppm)) ny_pppm++;
+  while (!factorable(nz_pppm)) nz_pppm++;
+
+  h_x = xprd/nx_pppm;
+  h_y = yprd/ny_pppm;
+  h_z = zprd_slab/nz_pppm;
+
+  if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET)
+    error->all(FLERR,"PPPMDipole grid is too large");
+}
+
+
+/* ----------------------------------------------------------------------
+   compute estimated kspace force error for dipoles
+------------------------------------------------------------------------- */
+
+double PPPMDipole::compute_df_kspace_dipole()
+{
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  double zprd_slab = zprd*slab_volfactor;
+  bigint natoms = atom->natoms;
+  double qopt = compute_qopt_dipole();
+  double df_kspace = sqrt(qopt/natoms)*mu2/(3.0*xprd*yprd*zprd_slab);
+  return df_kspace;
+}
+
+/* ----------------------------------------------------------------------
+   compute qopt for dipoles with ik differentiation
+------------------------------------------------------------------------- */
+
+double PPPMDipole::compute_qopt_dipole()
+{
+  double qopt = 0.0;
+  const double * const prd = domain->prd;
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  const double unitkx = (MY_2PI/xprd);
+  const double unitky = (MY_2PI/yprd);
+  const double unitkz = (MY_2PI/zprd_slab);
+
+  double snx,sny,snz;
+  double cnx,cny,cnz;
+  double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+  double sum1,sum2,dot1,dot2;
+  double numerator,denominator;
+  double u1,u2,u3,sqk;
+
+  int k,l,m,nx,ny,nz,kper,lper,mper;
+
+  const int nbx = 2;
+  const int nby = 2;
+  const int nbz = 2;
+
+  const int twoorder = 2*order;
+
+  for (m = nzlo_fft; m <= nzhi_fft; m++) {
+    mper = m - nz_pppm*(2*m/nz_pppm);
+    snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));
+    cnz = cos(0.5*unitkz*mper*zprd_slab/nz_pppm);
+
+    for (l = nylo_fft; l <= nyhi_fft; l++) {
+      lper = l - ny_pppm*(2*l/ny_pppm);
+      sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
+      cny = cos(0.5*unitky*lper*yprd/ny_pppm);
+
+      for (k = nxlo_fft; k <= nxhi_fft; k++) {
+        kper = k - nx_pppm*(2*k/nx_pppm);
+        snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
+        cnx = cos(0.5*unitkx*kper*xprd/nx_pppm);
+
+        sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
+
+        if (sqk != 0.0) {
+          numerator = MY_4PI/sqk;
+          denominator = gf_denom(snx,sny,snz);
+          sum1 = 0.0;
+          sum2 = 0.0;
+
+          for (nx = -nbx; nx <= nbx; nx++) {
+            qx = unitkx*(kper+nx_pppm*nx);
+            sx = exp(-0.25*square(qx/g_ewald));
+            argx = 0.5*qx*xprd/nx_pppm;
+            wx = powsinxx(argx,twoorder);
+
+            for (ny = -nby; ny <= nby; ny++) {
+              qy = unitky*(lper+ny_pppm*ny);
+              sy = exp(-0.25*square(qy/g_ewald));
+              argy = 0.5*qy*yprd/ny_pppm;
+              wy = powsinxx(argy,twoorder);
+
+              for (nz = -nbz; nz <= nbz; nz++) {
+                qz = unitkz*(mper+nz_pppm*nz);
+                sz = exp(-0.25*square(qz/g_ewald));
+                argz = 0.5*qz*zprd_slab/nz_pppm;
+                wz = powsinxx(argz,twoorder);
+
+                dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
+                dot2 = qx*qx + qy*qy + qz*qz;
+                //dot1 = dot1*dot1*dot1; // power of 3 for dipole forces
+                //dot2 = dot2*dot2*dot2;
+                u1 = sx*sy*sz;
+                const double w2 = wx*wy*wz;
+                const double phi = u1*MY_4PI/dot2;
+                const double top = dot1*dot1*dot1*w2*phi;
+                sum1 += phi*phi*dot2*dot2*dot2;
+                sum2 += top*top/sqk/sqk/sqk;
+              }
+            }
+          }
+          qopt += sum1 - sum2/denominator;
+        }
+      }
+    }
+  }
+  double qopt_all;
+  MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world);
+  return qopt_all;
+}
+
+/* ----------------------------------------------------------------------
+   pre-compute modified (Hockney-Eastwood) Coulomb Green's function
+------------------------------------------------------------------------- */
+
+void PPPMDipole::compute_gf_dipole()
+{
+  const double * const prd = domain->prd;
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  const double unitkx = (MY_2PI/xprd);
+  const double unitky = (MY_2PI/yprd);
+  const double unitkz = (MY_2PI/zprd_slab);
+
+  double snx,sny,snz;
+  double cnx,cny,cnz;
+  double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+  double sum1,dot1,dot2;
+  double numerator,denominator;
+  double sqk;
+
+  int k,l,m,n,nx,ny,nz,kper,lper,mper;
+
+  int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
+                              pow(-log(EPS_HOC),0.25));
+  int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
+                              pow(-log(EPS_HOC),0.25));
+  int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
+                              pow(-log(EPS_HOC),0.25));
+  nbx = MAX(nbx,2);
+  nby = MAX(nby,2);
+  nbz = MAX(nbz,2);
+  const int twoorder = 2*order;
+
+  n = 0;
+  for (m = nzlo_fft; m <= nzhi_fft; m++) {
+    mper = m - nz_pppm*(2*m/nz_pppm);
+    snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));
+    cnz = cos(0.5*unitkz*mper*zprd_slab/nz_pppm);
+
+    for (l = nylo_fft; l <= nyhi_fft; l++) {
+      lper = l - ny_pppm*(2*l/ny_pppm);
+      sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
+      cny = cos(0.5*unitky*lper*yprd/ny_pppm);
+
+      for (k = nxlo_fft; k <= nxhi_fft; k++) {
+        kper = k - nx_pppm*(2*k/nx_pppm);
+        snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
+        cnx = cos(0.5*unitkx*kper*xprd/nx_pppm);
+
+        sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
+
+        if (sqk != 0.0) {
+          numerator = MY_4PI/sqk;
+          denominator = gf_denom(snx,sny,snz);
+          sum1 = 0.0;
+
+          for (nx = -nbx; nx <= nbx; nx++) {
+            qx = unitkx*(kper+nx_pppm*nx);
+            sx = exp(-0.25*square(qx/g_ewald));
+            argx = 0.5*qx*xprd/nx_pppm;
+            wx = powsinxx(argx,twoorder);
+
+            for (ny = -nby; ny <= nby; ny++) {
+              qy = unitky*(lper+ny_pppm*ny);
+              sy = exp(-0.25*square(qy/g_ewald));
+              argy = 0.5*qy*yprd/ny_pppm;
+              wy = powsinxx(argy,twoorder);
+
+              for (nz = -nbz; nz <= nbz; nz++) {
+                qz = unitkz*(mper+nz_pppm*nz);
+                sz = exp(-0.25*square(qz/g_ewald));
+                argz = 0.5*qz*zprd_slab/nz_pppm;
+                wz = powsinxx(argz,twoorder);
+
+                dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
+                dot2 = qx*qx + qy*qy + qz*qz;
+                const double u1 = sx*sy*sz;
+                const double w2 = wx*wy*wz;
+                const double phi = u1*MY_4PI/dot2;
+                sum1 += dot1*dot1*dot1*w2*phi/sqk/sqk/sqk;
+              }
+            }
+          }
+          greensfn[n++] = sum1/denominator;
+        } else greensfn[n++] = 0.0;
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   calculate f(x) for use in Newton-Raphson solver
+------------------------------------------------------------------------- */
+
+double PPPMDipole::newton_raphson_f()
+{
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  bigint natoms = atom->natoms;
+
+  double df_rspace,df_kspace;
+  double vol = xprd*yprd*zprd;
+  double a = cutoff*g_ewald;
+  double rg2 = a*a;
+  double rg4 = rg2*rg2;
+  double rg6 = rg4*rg2;
+  double Cc = 4.0*rg4 + 6.0*rg2 + 3.0;
+  double Dc = 8.0*rg6 + 20.0*rg4 + 30.0*rg2 + 15.0;
+  df_rspace = (mu2/(sqrt(vol*powint(g_ewald,4)*powint(cutoff,9)*natoms)) *
+      sqrt(13.0/6.0*Cc*Cc + 2.0/15.0*Dc*Dc - 13.0/15.0*Cc*Dc) * exp(-rg2));
+  df_kspace = compute_df_kspace_dipole();
+
+  return df_rspace - df_kspace;
+}
+
+/* ----------------------------------------------------------------------
+   find g_ewald parameter for dipoles based on desired accuracy
+   using a Newton-Raphson solver
+------------------------------------------------------------------------- */
+
+double PPPMDipole::find_gewald_dipole(double x, double Rc,
+                              bigint natoms, double vol, double b2)
+{
+  double dx,tol;
+  int maxit;
+
+  maxit = 10000; //Maximum number of iterations
+  tol = 0.00001; //Convergence tolerance
+
+  //Begin algorithm
+
+  for (int i = 0; i < maxit; i++) {
+    dx = newton_raphson_f_dipole(x,Rc,natoms,vol,b2) / derivf_dipole(x,Rc,natoms,vol,b2);
+    x = x - dx; //Update x
+    if (fabs(dx) < tol) return x;
+    if (x < 0 || x != x) // solver failed
+      return -1;
+  }
+  return -1;
+}
+
+/* ----------------------------------------------------------------------
+   calculate f(x) objective function for dipoles
+ ------------------------------------------------------------------------- */
+
+double PPPMDipole::newton_raphson_f_dipole(double x, double Rc, bigint
+natoms, double vol, double b2)
+{
+  double a = Rc*x;
+  double rg2 = a*a;
+  double rg4 = rg2*rg2;
+  double rg6 = rg4*rg2;
+  double Cc = 4.0*rg4 + 6.0*rg2 + 3.0;
+  double Dc = 8.0*rg6 + 20.0*rg4 + 30.0*rg2 + 15.0;
+  double f = (b2/(sqrt(vol*powint(x,4)*powint(Rc,9)*natoms)) *
+    sqrt(13.0/6.0*Cc*Cc + 2.0/15.0*Dc*Dc - 13.0/15.0*Cc*Dc) *
+    exp(-rg2)) - accuracy;
+
+  return f;
+}
+
+/* ----------------------------------------------------------------------
+   calculate numerical derivative f'(x) of objective function for dipoles
+ ------------------------------------------------------------------------- */
+
+double PPPMDipole::derivf_dipole(double x, double Rc,
+                         bigint natoms, double vol, double b2)
+{
+  double h = 0.000001;  //Derivative step-size
+  return (newton_raphson_f_dipole(x + h,Rc,natoms,vol,b2) - newton_raphson_f_dipole(x,Rc,natoms,vol,b2)) / h;
+}
+
+/* ----------------------------------------------------------------------
+   calculate the final estimate of the accuracy
+------------------------------------------------------------------------- */
+
+double PPPMDipole::final_accuracy_dipole()
+{
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  double vol = xprd*yprd*zprd;
+  bigint natoms = atom->natoms;
+  if (natoms == 0) natoms = 1; // avoid division by zero
+
+  double df_kspace = compute_df_kspace_dipole();
+
+  double a = cutoff*g_ewald;
+  double rg2 = a*a;
+  double rg4 = rg2*rg2;
+  double rg6 = rg4*rg2;
+  double Cc = 4.0*rg4 + 6.0*rg2 + 3.0;
+  double Dc = 8.0*rg6 + 20.0*rg4 + 30.0*rg2 + 15.0;
+  double df_rspace = (mu2/(sqrt(vol*powint(g_ewald,4)*powint(cutoff,9)*natoms)) *
+    sqrt(13.0/6.0*Cc*Cc + 2.0/15.0*Dc*Dc - 13.0/15.0*Cc*Dc) *
+    exp(-rg2));
+
+  double estimated_accuracy = sqrt(df_kspace*df_kspace + df_rspace*df_rspace);
+
+  return estimated_accuracy;
+}
+
+/* ----------------------------------------------------------------------
+   pre-compute Green's function denominator expansion coeffs, Gamma(2n)
+------------------------------------------------------------------------- */
+
+void PPPMDipole::compute_gf_denom()
+{
+  if (gf_b) memory->destroy(gf_b);
+  memory->create(gf_b,order,"pppm_dipole:gf_b");
+
+  int k,l,m;
+
+  for (l = 1; l < order; l++) gf_b[l] = 0.0;
+  gf_b[0] = 1.0;
+
+  for (m = 1; m < order; m++) {
+    for (l = m; l > 0; l--)
+      gf_b[l] = 4.0 * (gf_b[l]*(l-m)*(l-m-0.5)-gf_b[l-1]*(l-m-1)*(l-m-1));
+    gf_b[0] = 4.0 * (gf_b[0]*(l-m)*(l-m-0.5));
+  }
+
+  bigint ifact = 1;
+  for (k = 1; k < 2*order; k++) ifact *= k;
+  double gaminv = 1.0/ifact;
+  for (l = 0; l < order; l++) gf_b[l] *= gaminv;
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = charge "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid
+------------------------------------------------------------------------- */
+
+void PPPMDipole::make_rho_dipole()
+{
+  int l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz;
+  FFT_SCALAR x0,y0,z0;
+  FFT_SCALAR x1,y1,z1;
+  FFT_SCALAR x2,y2,z2;
+
+  // clear 3d density array
+
+  memset(&(densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+  memset(&(densityy_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+  memset(&(densityz_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  double **mu = atom->mu;
+  double **x = atom->x;
+  int nlocal = atom->nlocal;
+
+  for (int i = 0; i < nlocal; i++) {
+
+    nx = part2grid[i][0];
+    ny = part2grid[i][1];
+    nz = part2grid[i][2];
+    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(dx,dy,dz);
+
+    z0 = delvolinv * mu[i][0];
+    z1 = delvolinv * mu[i][1];
+    z2 = delvolinv * mu[i][2];
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      y0 = z0*rho1d[2][n];
+      y1 = z1*rho1d[2][n];
+      y2 = z2*rho1d[2][n];
+      for (m = nlower; m <= nupper; m++) {
+        my = m+ny;
+        x0 = y0*rho1d[1][m];
+        x1 = y1*rho1d[1][m];
+        x2 = y2*rho1d[1][m];
+        for (l = nlower; l <= nupper; l++) {
+          mx = l+nx;
+          densityx_brick_dipole[mz][my][mx] += x0*rho1d[0][l];
+          densityy_brick_dipole[mz][my][mx] += x1*rho1d[0][l];
+          densityz_brick_dipole[mz][my][mx] += x2*rho1d[0][l];
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   remap density from 3d brick decomposition to FFT decomposition
+------------------------------------------------------------------------- */
+
+void PPPMDipole::brick2fft_dipole()
+{
+  int n,ix,iy,iz;
+
+  // copy grabs inner portion of density from 3d brick
+  // remap could be done as pre-stage of FFT,
+  //   but this works optimally on only double values, not complex values
+
+  n = 0;
+  for (iz = nzlo_in; iz <= nzhi_in; iz++)
+    for (iy = nylo_in; iy <= nyhi_in; iy++)
+      for (ix = nxlo_in; ix <= nxhi_in; ix++) {
+        densityx_fft_dipole[n] = densityx_brick_dipole[iz][iy][ix];
+        densityy_fft_dipole[n] = densityy_brick_dipole[iz][iy][ix];
+        densityz_fft_dipole[n] = densityz_brick_dipole[iz][iy][ix];
+        n++;
+      }
+
+  remap->perform(densityx_fft_dipole,densityx_fft_dipole,work1);
+  remap->perform(densityy_fft_dipole,densityy_fft_dipole,work1);
+  remap->perform(densityz_fft_dipole,densityz_fft_dipole,work1);
+}
+
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver for ik
+------------------------------------------------------------------------- */
+
+void PPPMDipole::poisson_ik_dipole()
+{
+  int i,j,k,n,ii;
+  double eng;
+  double wreal,wimg;
+
+  // transform dipole density (r -> k)
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work1[n] = densityx_fft_dipole[i];
+    work1[n+1] = ZEROF;
+    work2[n] = densityy_fft_dipole[i];
+    work2[n+1] = ZEROF;
+    work3[n] = densityz_fft_dipole[i];
+    work3[n+1] = ZEROF;
+    n += 2;
+  }
+
+  fft1->compute(work1,work1,1);
+  fft1->compute(work2,work2,1);
+  fft1->compute(work3,work3,1);
+
+  // global energy and virial contribution
+
+  double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
+  double s2 = scaleinv*scaleinv;
+
+  if (eflag_global || vflag_global) {
+    if (vflag_global) {
+      n = 0;
+      ii = 0;
+      for (k = nzlo_fft; k <= nzhi_fft; k++)
+        for (j = nylo_fft; j <= nyhi_fft; j++)
+          for (i = nxlo_fft; i <= nxhi_fft; i++) {
+            wreal = (work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+            wimg = (work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+            eng = s2 * greensfn[ii] * (wreal*wreal + wimg*wimg);
+            for (int jj = 0; jj < 6; jj++) virial[jj] += eng*vg[ii][jj];
+            virial[0] += 2.0*s2*greensfn[ii]*fkx[i]*(work1[n]*wreal + work1[n+1]*wimg);
+            virial[1] += 2.0*s2*greensfn[ii]*fky[j]*(work2[n]*wreal + work2[n+1]*wimg);
+            virial[2] += 2.0*s2*greensfn[ii]*fkz[k]*(work3[n]*wreal + work3[n+1]*wimg);
+            virial[3] += 2.0*s2*greensfn[ii]*fky[j]*(work1[n]*wreal + work1[n+1]*wimg);
+            virial[4] += 2.0*s2*greensfn[ii]*fkz[k]*(work1[n]*wreal + work1[n+1]*wimg);
+            virial[5] += 2.0*s2*greensfn[ii]*fkz[k]*(work2[n]*wreal + work2[n+1]*wimg);
+            if (eflag_global) energy += eng;
+            ii++;
+            n += 2;
+          }
+    } else {
+      n = 0;
+      ii = 0;
+      for (k = nzlo_fft; k <= nzhi_fft; k++)
+        for (j = nylo_fft; j <= nyhi_fft; j++)
+          for (i = nxlo_fft; i <= nxhi_fft; i++) {
+            wreal = (work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+            wimg = (work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+            energy +=
+            s2 * greensfn[ii] * (wreal*wreal + wimg*wimg);
+   ii++;
+            n += 2;
+          }
+    }
+  }
+
+  // scale by 1/total-grid-pts to get rho(k)
+  // multiply by Green's function to get V(k)
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work1[n]   *= scaleinv * greensfn[i];
+    work1[n+1] *= scaleinv * greensfn[i];
+    work2[n]   *= scaleinv * greensfn[i];
+    work2[n+1] *= scaleinv * greensfn[i];
+    work3[n]   *= scaleinv * greensfn[i];
+    work3[n+1] *= scaleinv * greensfn[i];
+    n += 2;
+  }
+
+  // extra FFTs for per-atom energy/virial
+
+  if (vflag_atom) poisson_peratom_dipole();
+
+  // compute electric potential
+  // FFT leaves data in 3d brick decomposition
+
+  // Ex
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        work4[n+1] = fkx[i]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        ux_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Ey
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        work4[n+1] = fky[j]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        uy_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Ez
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        work4[n+1] = fkz[k]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        uz_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Vxx
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*fkx[i]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        work4[n+1] = -fkx[i]*fkx[i]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdxx_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Vyy
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*fky[j]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        work4[n+1] = -fky[j]*fky[j]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdyy_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Vzz
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*fkz[k]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        work4[n+1] = -fkz[k]*fkz[k]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdzz_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Vxy
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*fky[j]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        work4[n+1] = -fkx[i]*fky[j]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdxy_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Vxz
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*fkz[k]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        work4[n+1] = -fkx[i]*fkz[k]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdxz_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Vyz
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*fkz[k]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        work4[n+1] = -fky[j]*fkz[k]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdyz_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+}
+
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver for per-atom energy/virial
+------------------------------------------------------------------------- */
+
+void PPPMDipole::poisson_peratom_dipole()
+{
+  int i,ii,j,k,n;
+
+  // 18 components of virial in v0 thru v5
+
+  if (!vflag_atom) return;
+
+  // V0x
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(vg[ii][0]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkx[i]*work1[n]);
+        work4[n+1] = fkx[i]*(vg[ii][0]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkx[i]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v0x_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V0y
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(vg[ii][0]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkx[i]*work1[n]);
+        work4[n+1] = fky[j]*(vg[ii][0]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkx[i]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v0y_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V0z
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(vg[ii][0]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkx[i]*work1[n]);
+        work4[n+1] = fkz[k]*(vg[ii][0]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkx[i]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v0z_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V1x
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(vg[ii][1]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fky[j]*work2[n]);
+        work4[n+1] = fkx[i]*(vg[ii][1]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fky[j]*work2[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v1x_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V1y
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(vg[ii][1]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fky[j]*work2[n]);
+        work4[n+1] = fky[j]*(vg[ii][1]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fky[j]*work2[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v1y_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V1z
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(vg[ii][1]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fky[j]*work2[n]);
+        work4[n+1] = fkz[k]*(vg[ii][1]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fky[j]*work2[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v1z_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V2x
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(vg[ii][2]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work3[n]);
+        work4[n+1] = fkx[i]*(vg[ii][2]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work3[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v2x_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V2y
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(vg[ii][2]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work3[n]);
+        work4[n+1] = fky[j]*(vg[ii][2]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work3[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v2y_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V2z
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(vg[ii][2]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work3[n]);
+        work4[n+1] = fkz[k]*(vg[ii][2]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work3[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v2z_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V3x
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(vg[ii][3]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fky[j]*work1[n]);
+        work4[n+1] = fkx[i]*(vg[ii][3]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fky[j]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v3x_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V3y
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(vg[ii][3]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fky[j]*work1[n]);
+        work4[n+1] = fky[j]*(vg[ii][3]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fky[j]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v3y_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V3z
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(vg[ii][3]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fky[j]*work1[n]);
+        work4[n+1] = fkz[k]*(vg[ii][3]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fky[j]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v3z_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V4x
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(vg[ii][4]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work1[n]);
+        work4[n+1] = fkx[i]*(vg[ii][4]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v4x_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V4y
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(vg[ii][4]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work1[n]);
+        work4[n+1] = fky[j]*(vg[ii][4]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v4y_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V4z
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(vg[ii][4]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work1[n]);
+        work4[n+1] = fkz[k]*(vg[ii][4]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v4z_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V5x
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(vg[ii][5]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work2[n]);
+        work4[n+1] = fkx[i]*(vg[ii][5]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work2[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v5x_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V5y
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(vg[ii][5]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work2[n]);
+        work4[n+1] = fky[j]*(vg[ii][5]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work2[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v5y_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V5z
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(vg[ii][5]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work2[n]);
+        work4[n+1] = fkz[k]*(vg[ii][5]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work2[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v5z_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles for ik
+------------------------------------------------------------------------- */
+
+void PPPMDipole::fieldforce_ik_dipole()
+{
+  int i,l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz;
+  FFT_SCALAR x0,y0,z0;
+  FFT_SCALAR ex,ey,ez;
+  FFT_SCALAR vxx,vyy,vzz,vxy,vxz,vyz;
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+
+  double **mu = atom->mu;
+  double **x = atom->x;
+  double **f = atom->f;
+  double **t = atom->torque;
+
+  int nlocal = atom->nlocal;
+
+  for (i = 0; i < nlocal; i++) {
+    nx = part2grid[i][0];
+    ny = part2grid[i][1];
+    nz = part2grid[i][2];
+    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(dx,dy,dz);
+
+    ex = ey = ez = ZEROF;
+    vxx = vyy = vzz = vxy = vxz = vyz = ZEROF;
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      z0 = rho1d[2][n];
+      for (m = nlower; m <= nupper; m++) {
+        my = m+ny;
+        y0 = z0*rho1d[1][m];
+        for (l = nlower; l <= nupper; l++) {
+          mx = l+nx;
+          x0 = y0*rho1d[0][l];
+          ex -= x0*ux_brick_dipole[mz][my][mx];
+          ey -= x0*uy_brick_dipole[mz][my][mx];
+          ez -= x0*uz_brick_dipole[mz][my][mx];
+          vxx -= x0*vdxx_brick_dipole[mz][my][mx];
+          vyy -= x0*vdyy_brick_dipole[mz][my][mx];
+          vzz -= x0*vdzz_brick_dipole[mz][my][mx];
+          vxy -= x0*vdxy_brick_dipole[mz][my][mx];
+          vxz -= x0*vdxz_brick_dipole[mz][my][mx];
+          vyz -= x0*vdyz_brick_dipole[mz][my][mx];
+        }
+      }
+    }
+
+    // convert E-field to torque
+
+    const double mufactor = qqrd2e * scale;
+    f[i][0] += mufactor*(vxx*mu[i][0] + vxy*mu[i][1] + vxz*mu[i][2]);
+    f[i][1] += mufactor*(vxy*mu[i][0] + vyy*mu[i][1] + vyz*mu[i][2]);
+    f[i][2] += mufactor*(vxz*mu[i][0] + vyz*mu[i][1] + vzz*mu[i][2]);
+
+    t[i][0] += mufactor*(mu[i][1]*ez - mu[i][2]*ey);
+    t[i][1] += mufactor*(mu[i][2]*ex - mu[i][0]*ez);
+    t[i][2] += mufactor*(mu[i][0]*ey - mu[i][1]*ex);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */
+
+void PPPMDipole::fieldforce_peratom_dipole()
+{
+  int i,l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz,x0,y0,z0;
+  FFT_SCALAR ux,uy,uz;
+  FFT_SCALAR v0x,v1x,v2x,v3x,v4x,v5x;
+  FFT_SCALAR v0y,v1y,v2y,v3y,v4y,v5y;
+  FFT_SCALAR v0z,v1z,v2z,v3z,v4z,v5z;
+
+  // loop over my charges, interpolate from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  double **mu = atom->mu;
+  double **x = atom->x;
+
+  int nlocal = atom->nlocal;
+
+  for (i = 0; i < nlocal; i++) {
+    nx = part2grid[i][0];
+    ny = part2grid[i][1];
+    nz = part2grid[i][2];
+    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(dx,dy,dz);
+
+    ux = uy = uz = ZEROF;
+    v0x = v1x = v2x = v3x = v4x = v5x = ZEROF;
+    v0y = v1y = v2y = v3y = v4y = v5y = ZEROF;
+    v0z = v1z = v2z = v3z = v4z = v5z = ZEROF;
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      z0 = rho1d[2][n];
+      for (m = nlower; m <= nupper; m++) {
+        my = m+ny;
+        y0 = z0*rho1d[1][m];
+        for (l = nlower; l <= nupper; l++) {
+          mx = l+nx;
+          x0 = y0*rho1d[0][l];
+          if (eflag_atom) {
+            ux += x0*ux_brick_dipole[mz][my][mx];
+            uy += x0*uy_brick_dipole[mz][my][mx];
+            uz += x0*uz_brick_dipole[mz][my][mx];
+          }
+          if (vflag_atom) {
+            v0x += x0*v0x_brick_dipole[mz][my][mx];
+            v1x += x0*v1x_brick_dipole[mz][my][mx];
+            v2x += x0*v2x_brick_dipole[mz][my][mx];
+            v3x += x0*v3x_brick_dipole[mz][my][mx];
+            v4x += x0*v4x_brick_dipole[mz][my][mx];
+            v5x += x0*v5x_brick_dipole[mz][my][mx];
+            v0y += x0*v0y_brick_dipole[mz][my][mx];
+            v1y += x0*v1y_brick_dipole[mz][my][mx];
+            v2y += x0*v2y_brick_dipole[mz][my][mx];
+            v3y += x0*v3y_brick_dipole[mz][my][mx];
+            v4y += x0*v4y_brick_dipole[mz][my][mx];
+            v5y += x0*v5y_brick_dipole[mz][my][mx];
+            v0z += x0*v0z_brick_dipole[mz][my][mx];
+            v1z += x0*v1z_brick_dipole[mz][my][mx];
+            v2z += x0*v2z_brick_dipole[mz][my][mx];
+            v3z += x0*v3z_brick_dipole[mz][my][mx];
+            v4z += x0*v4z_brick_dipole[mz][my][mx];
+            v5z += x0*v5z_brick_dipole[mz][my][mx];
+          }
+        }
+      }
+    }
+
+    if (eflag_atom) eatom[i] += mu[i][0]*ux + mu[i][1]*uy + mu[i][2]*uz;
+    if (vflag_atom) {
+      vatom[i][0] += mu[i][0]*v0x + mu[i][1]*v0y + mu[i][2]*v0z;
+      vatom[i][1] += mu[i][0]*v1x + mu[i][1]*v1y + mu[i][2]*v1z;
+      vatom[i][2] += mu[i][0]*v2x + mu[i][1]*v2y + mu[i][2]*v2z;
+      vatom[i][3] += mu[i][0]*v3x + mu[i][1]*v3y + mu[i][2]*v3z;
+      vatom[i][4] += mu[i][0]*v4x + mu[i][1]*v4y + mu[i][2]*v4z;
+      vatom[i][5] += mu[i][0]*v5x + mu[i][1]*v5y + mu[i][2]*v5z;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack own values to buf to send to another proc
+------------------------------------------------------------------------- */
+
+void PPPMDipole::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  int n = 0;
+
+  if (flag == FORWARD_MU) {
+    FFT_SCALAR *src_ux = &ux_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *src_uy = &uy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *src_uz = &uz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *src_vxx = &vdxx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *src_vyy = &vdyy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *src_vzz = &vdzz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *src_vxy = &vdxy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *src_vxz = &vdxz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *src_vyz = &vdyz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      buf[n++] = src_ux[list[i]];
+      buf[n++] = src_uy[list[i]];
+      buf[n++] = src_uz[list[i]];
+      buf[n++] = src_vxx[list[i]];
+      buf[n++] = src_vyy[list[i]];
+      buf[n++] = src_vzz[list[i]];
+      buf[n++] = src_vxy[list[i]];
+      buf[n++] = src_vxz[list[i]];
+      buf[n++] = src_vyz[list[i]];
+    }
+  } else if (flag == FORWARD_MU_PERATOM) {
+    FFT_SCALAR *v0xsrc = &v0x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1xsrc = &v1x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2xsrc = &v2x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3xsrc = &v3x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4xsrc = &v4x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5xsrc = &v5x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v0ysrc = &v0y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1ysrc = &v1y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2ysrc = &v2y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3ysrc = &v3y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4ysrc = &v4y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5ysrc = &v5y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v0zsrc = &v0z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1zsrc = &v1z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2zsrc = &v2z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3zsrc = &v3z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4zsrc = &v4z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5zsrc = &v5z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      buf[n++] = v0xsrc[list[i]];
+      buf[n++] = v1xsrc[list[i]];
+      buf[n++] = v2xsrc[list[i]];
+      buf[n++] = v3xsrc[list[i]];
+      buf[n++] = v4xsrc[list[i]];
+      buf[n++] = v5xsrc[list[i]];
+      buf[n++] = v0ysrc[list[i]];
+      buf[n++] = v1ysrc[list[i]];
+      buf[n++] = v2ysrc[list[i]];
+      buf[n++] = v3ysrc[list[i]];
+      buf[n++] = v4ysrc[list[i]];
+      buf[n++] = v5ysrc[list[i]];
+      buf[n++] = v0zsrc[list[i]];
+      buf[n++] = v1zsrc[list[i]];
+      buf[n++] = v2zsrc[list[i]];
+      buf[n++] = v3zsrc[list[i]];
+      buf[n++] = v4zsrc[list[i]];
+      buf[n++] = v5zsrc[list[i]];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack another proc's own values from buf and set own ghost values
+------------------------------------------------------------------------- */
+
+void PPPMDipole::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  int n = 0;
+
+  if (flag == FORWARD_MU) {
+    FFT_SCALAR *dest_ux = &ux_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *dest_uy = &uy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *dest_uz = &uz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *dest_vxx = &vdxx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *dest_vyy = &vdyy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *dest_vzz = &vdzz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *dest_vxy = &vdxy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *dest_vxz = &vdxz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *dest_vyz = &vdyz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      dest_ux[list[i]] = buf[n++];
+      dest_uy[list[i]] = buf[n++];
+      dest_uz[list[i]] = buf[n++];
+      dest_vxx[list[i]] = buf[n++];
+      dest_vyy[list[i]] = buf[n++];
+      dest_vzz[list[i]] = buf[n++];
+      dest_vxy[list[i]] = buf[n++];
+      dest_vxz[list[i]] = buf[n++];
+      dest_vyz[list[i]] = buf[n++];
+    }
+  } else if (flag == FORWARD_MU_PERATOM) {
+    FFT_SCALAR *v0xsrc = &v0x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1xsrc = &v1x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2xsrc = &v2x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3xsrc = &v3x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4xsrc = &v4x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5xsrc = &v5x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v0ysrc = &v0y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1ysrc = &v1y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2ysrc = &v2y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3ysrc = &v3y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4ysrc = &v4y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5ysrc = &v5y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v0zsrc = &v0z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v1zsrc = &v1z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v2zsrc = &v2z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v3zsrc = &v3z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v4zsrc = &v4z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *v5zsrc = &v5z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      v0xsrc[list[i]] = buf[n++];
+      v1xsrc[list[i]] = buf[n++];
+      v2xsrc[list[i]] = buf[n++];
+      v3xsrc[list[i]] = buf[n++];
+      v4xsrc[list[i]] = buf[n++];
+      v5xsrc[list[i]] = buf[n++];
+      v0ysrc[list[i]] = buf[n++];
+      v1ysrc[list[i]] = buf[n++];
+      v2ysrc[list[i]] = buf[n++];
+      v3ysrc[list[i]] = buf[n++];
+      v4ysrc[list[i]] = buf[n++];
+      v5ysrc[list[i]] = buf[n++];
+      v0zsrc[list[i]] = buf[n++];
+      v1zsrc[list[i]] = buf[n++];
+      v2zsrc[list[i]] = buf[n++];
+      v3zsrc[list[i]] = buf[n++];
+      v4zsrc[list[i]] = buf[n++];
+      v5zsrc[list[i]] = buf[n++];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack ghost values into buf to send to another proc
+------------------------------------------------------------------------- */
+
+void PPPMDipole::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  int n = 0;
+  if (flag == REVERSE_MU) {
+    FFT_SCALAR *src_dipole0 = &densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *src_dipole1 = &densityy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *src_dipole2 = &densityz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      buf[n++] = src_dipole0[list[i]];
+      buf[n++] = src_dipole1[list[i]];
+      buf[n++] = src_dipole2[list[i]];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack another proc's ghost values from buf and add to own values
+------------------------------------------------------------------------- */
+
+void PPPMDipole::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  int n = 0;
+  if (flag == REVERSE_MU) {
+    FFT_SCALAR *dest_dipole0 = &densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *dest_dipole1 = &densityy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    FFT_SCALAR *dest_dipole2 = &densityz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    for (int i = 0; i < nlist; i++) {
+      dest_dipole0[list[i]] += buf[n++];
+      dest_dipole1[list[i]] += buf[n++];
+      dest_dipole2[list[i]] += buf[n++];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Slab-geometry correction term to dampen inter-slab interactions between
+   periodically repeating slabs.  Yields good approximation to 2D Ewald if
+   adequate empty space is left between repeating slabs (J. Chem. Phys.
+   111, 3155).  Slabs defined here to be parallel to the xy plane. Also
+   extended to non-neutral systems (J. Chem. Phys. 131, 094107).
+------------------------------------------------------------------------- */
+
+void PPPMDipole::slabcorr()
+{
+  // compute local contribution to global dipole moment
+
+  double **x = atom->x;
+  double zprd = domain->zprd;
+  int nlocal = atom->nlocal;
+
+  double dipole = 0.0;
+  double **mu = atom->mu;
+  for (int i = 0; i < nlocal; i++) dipole += mu[i][2];
+
+  // sum local contributions to get global dipole moment
+
+  double dipole_all;
+  MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world);
+
+  // need to make non-neutral systems and/or
+  //  per-atom energy translationally invariant
+
+  if (eflag_atom || fabs(qsum) > SMALL) {
+
+    error->all(FLERR,"Cannot (yet) use kspace slab correction with "
+      "long-range dipoles and non-neutral systems or per-atom energy");
+  }
+
+  // compute corrections
+
+  const double e_slabcorr = MY_2PI*(dipole_all*dipole_all/12.0)/volume;
+  const double qscale = qqrd2e * scale;
+
+  if (eflag_global) energy += qscale * e_slabcorr;
+
+  // per-atom energy
+
+  if (eflag_atom) {
+    double efact = qscale * MY_2PI/volume/12.0;
+    for (int i = 0; i < nlocal; i++)
+      eatom[i] += efact * mu[i][2]*dipole_all;
+  }
+
+  // add on torque corrections
+
+  if (atom->torque) {
+    double ffact = qscale * (-4.0*MY_PI/volume);
+    double **mu = atom->mu;
+    double **torque = atom->torque;
+    for (int i = 0; i < nlocal; i++) {
+      torque[i][0] += ffact * dipole_all * mu[i][1];
+      torque[i][1] += -ffact * dipole_all * mu[i][0];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   perform and time the 1d FFTs required for N timesteps
+------------------------------------------------------------------------- */
+
+int PPPMDipole::timing_1d(int n, double &time1d)
+{
+  double time1,time2;
+
+  for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF;
+
+  MPI_Barrier(world);
+  time1 = MPI_Wtime();
+
+  for (int i = 0; i < n; i++) {
+    fft1->timing1d(work1,nfft_both,1);
+    fft1->timing1d(work1,nfft_both,1);
+    fft1->timing1d(work1,nfft_both,1);
+    fft2->timing1d(work1,nfft_both,-1);
+    fft2->timing1d(work1,nfft_both,-1);
+    fft2->timing1d(work1,nfft_both,-1);
+    fft2->timing1d(work1,nfft_both,-1);
+    fft2->timing1d(work1,nfft_both,-1);
+    fft2->timing1d(work1,nfft_both,-1);
+    fft2->timing1d(work1,nfft_both,-1);
+    fft2->timing1d(work1,nfft_both,-1);
+    fft2->timing1d(work1,nfft_both,-1);
+  }
+
+  MPI_Barrier(world);
+  time2 = MPI_Wtime();
+  time1d = time2 - time1;
+
+  return 12;
+}
+
+/* ----------------------------------------------------------------------
+   perform and time the 3d FFTs required for N timesteps
+------------------------------------------------------------------------- */
+
+int PPPMDipole::timing_3d(int n, double &time3d)
+{
+  double time1,time2;
+
+  for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF;
+
+  MPI_Barrier(world);
+  time1 = MPI_Wtime();
+
+  for (int i = 0; i < n; i++) {
+    fft1->compute(work1,work1,1);
+    fft1->compute(work1,work1,1);
+    fft1->compute(work1,work1,1);
+    fft2->compute(work1,work1,-1);
+    fft2->compute(work1,work1,-1);
+    fft2->compute(work1,work1,-1);
+    fft2->compute(work1,work1,-1);
+    fft2->compute(work1,work1,-1);
+    fft2->compute(work1,work1,-1);
+    fft2->compute(work1,work1,-1);
+    fft2->compute(work1,work1,-1);
+    fft2->compute(work1,work1,-1);
+  }
+
+  MPI_Barrier(world);
+  time2 = MPI_Wtime();
+  time3d = time2 - time1;
+
+  return 12;
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of local arrays
+------------------------------------------------------------------------- */
+
+double PPPMDipole::memory_usage()
+{
+  double bytes = nmax*3 * sizeof(double);
+  int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
+    (nzhi_out-nzlo_out+1);
+  bytes += 6 * nfft_both * sizeof(double);   // vg
+  bytes += nfft_both * sizeof(double); 	     // greensfn
+  bytes += nfft_both*5 * sizeof(FFT_SCALAR); // work*2*2
+  bytes += 9 * nbrick * sizeof(FFT_SCALAR);  // ubrick*3 + vdbrick*6
+  bytes += nfft_both*7 * sizeof(FFT_SCALAR); // density_ffx*3 + work*2*2
+
+  if (peratom_allocate_flag)
+    bytes += 21 * nbrick * sizeof(FFT_SCALAR);
+
+  if (cg_dipole) bytes += cg_dipole->memory_usage();
+  if (cg_peratom_dipole) bytes += cg_peratom_dipole->memory_usage();
+
+  return bytes;
+}
+
+/* ----------------------------------------------------------------------
+   compute musum,musqsum,mu2
+   called initially, when particle count changes, when dipoles are changed
+------------------------------------------------------------------------- */
+
+void PPPMDipole::musum_musq()
+{
+  const int nlocal = atom->nlocal;
+
+  musum = musqsum = mu2 = 0.0;
+  if (atom->mu_flag) {
+    double** mu = atom->mu;
+    double musum_local(0.0), musqsum_local(0.0);
+
+    for (int i = 0; i < nlocal; i++) {
+      musum_local += mu[i][0] + mu[i][1] + mu[i][2];
+      musqsum_local += mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + mu[i][2]*mu[i][2];
+    }
+
+    MPI_Allreduce(&musum_local,&musum,1,MPI_DOUBLE,MPI_SUM,world);
+    MPI_Allreduce(&musqsum_local,&musqsum,1,MPI_DOUBLE,MPI_SUM,world);
+
+    mu2 = musqsum * force->qqrd2e;
+  }
+
+  if (mu2 == 0 && comm->me == 0)
+    error->all(FLERR,"Using kspace solver PPPMDipole on system with no dipoles");
+}
diff --git a/src/KSPACE/pppm_dipole.h b/src/KSPACE/pppm_dipole.h
new file mode 100644
index 0000000000..d06919644b
--- /dev/null
+++ b/src/KSPACE/pppm_dipole.h
@@ -0,0 +1,217 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/dipole,PPPMDipole)
+
+#else
+
+#ifndef LMP_PPPM_DIPOLE_H
+#define LMP_PPPM_DIPOLE_H
+
+#include "pppm.h"
+
+namespace LAMMPS_NS {
+
+class PPPMDipole : public PPPM {
+ public:
+  PPPMDipole(class LAMMPS *);
+  virtual ~PPPMDipole();
+  void init();
+  void setup();
+  void setup_grid();
+  void compute(int, int);
+  int timing_1d(int, double &);
+  int timing_3d(int, double &);
+  double memory_usage();
+
+ protected:
+  void set_grid_global();
+  double newton_raphson_f(); 
+
+  void allocate();
+  void allocate_peratom();
+  void deallocate();
+  void deallocate_peratom();
+  void compute_gf_denom();
+
+  void slabcorr();
+
+  // grid communication
+
+  void pack_forward(int, FFT_SCALAR *, int, int *);
+  void unpack_forward(int, FFT_SCALAR *, int, int *);
+  void pack_reverse(int, FFT_SCALAR *, int, int *);
+  void unpack_reverse(int, FFT_SCALAR *, int, int *);
+
+  // dipole
+
+  FFT_SCALAR ***densityx_brick_dipole,***densityy_brick_dipole,***densityz_brick_dipole;
+  FFT_SCALAR ***vdxx_brick_dipole,***vdyy_brick_dipole,***vdzz_brick_dipole;
+  FFT_SCALAR ***vdxy_brick_dipole,***vdxz_brick_dipole,***vdyz_brick_dipole;
+  FFT_SCALAR ***ux_brick_dipole,***uy_brick_dipole,***uz_brick_dipole;
+  FFT_SCALAR ***v0x_brick_dipole,***v1x_brick_dipole,***v2x_brick_dipole;
+  FFT_SCALAR ***v3x_brick_dipole,***v4x_brick_dipole,***v5x_brick_dipole;
+  FFT_SCALAR ***v0y_brick_dipole,***v1y_brick_dipole,***v2y_brick_dipole;
+  FFT_SCALAR ***v3y_brick_dipole,***v4y_brick_dipole,***v5y_brick_dipole;
+  FFT_SCALAR ***v0z_brick_dipole,***v1z_brick_dipole,***v2z_brick_dipole;
+  FFT_SCALAR ***v3z_brick_dipole,***v4z_brick_dipole,***v5z_brick_dipole;
+  FFT_SCALAR *work3,*work4;
+  FFT_SCALAR *densityx_fft_dipole,*densityy_fft_dipole,*densityz_fft_dipole;
+  class GridComm *cg_dipole;
+  class GridComm *cg_peratom_dipole;
+  int only_dipole_flag;
+  double musum,musqsum,mu2;
+  double find_gewald_dipole(double, double, bigint, double, double);
+  double newton_raphson_f_dipole(double, double, bigint, double, double);
+  double derivf_dipole(double, double, bigint, double, double);
+  double compute_df_kspace_dipole();
+  double compute_qopt_dipole();
+  void compute_gf_dipole();
+  void make_rho_dipole();
+  void brick2fft_dipole();
+  void poisson_ik_dipole();
+  void poisson_peratom_dipole();
+  void fieldforce_ik_dipole();
+  void fieldforce_peratom_dipole();
+  double final_accuracy_dipole();
+  void musum_musq();
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Cannot (yet) use charges with Kspace style PPPMDipole
+
+Charge-dipole interactions are not yet implemented in PPPMDipole so this
+feature is not yet supported.
+
+E: Must redefine kspace_style after changing to triclinic box
+
+Self-explanatory.
+
+E: Kspace style requires atom attribute mu
+
+The atom style defined does not have this attribute.
+
+E: Cannot (yet) use kspace_modify diff ad with dipoles
+
+This feature is not yet supported.
+
+E: Cannot (yet) use 'electron' units with dipoles
+
+This feature is not yet supported.
+
+E: Cannot yet use triclinic cells with PPPMDipole
+
+This feature is not yet supported.
+
+E: Cannot yet use TIP4P with PPPMDipole
+
+This feature is not yet supported.
+
+E: Cannot use nonperiodic boundaries with PPPM
+
+For kspace style pppm, all 3 dimensions must have periodic boundaries
+unless you use the kspace_modify command to define a 2d slab with a
+non-periodic z dimension.
+
+E: Incorrect boundaries with slab PPPM
+
+Must have periodic x,y dimensions and non-periodic z dimension to use
+2d slab option with PPPM.
+
+E: PPPM order cannot be < 2 or > than %d
+
+This is a limitation of the PPPM implementation in LAMMPS.
+
+E: KSpace style is incompatible with Pair style
+
+Setting a kspace style requires that a pair style with matching
+long-range dipole components be used.
+
+W: Reducing PPPM order b/c stencil extends beyond nearest neighbor processor
+
+This may lead to a larger grid than desired. See the kspace_modify overlap
+command to prevent changing of the PPPM order.
+
+E: PPPM order < minimum allowed order
+
+The default minimum order is 2. This can be reset by the
+kspace_modify minorder command.
+
+E: PPPM grid stencil extends beyond nearest neighbor processor
+
+This is not allowed if the kspace_modify overlap setting is no.
+
+E: Cannot (yet) compute per-atom virial with kspace style pppm/dipole
+
+This feature is not yet supported.
+
+E: KSpace accuracy must be > 0
+
+The kspace accuracy designated in the input must be greater than zero.
+
+E: Could not compute grid size
+
+The code is unable to compute a grid size consistent with the desired
+accuracy. This error should not occur for typical problems. Please
+send an email to the developers.
+
+E: PPPM grid is too large
+
+The global PPPM grid is larger than OFFSET in one or more dimensions.
+OFFSET is currently set to 4096. You likely need to decrease the
+requested accuracy.
+
+E: Could not compute g_ewald
+
+The Newton-Raphson solver failed to converge to a good value for
+g_ewald. This error should not occur for typical problems. Please
+send an email to the developers.
+
+E: Non-numeric box dimensions - simulation unstable
+
+The box size has apparently blown up.
+
+E: Out of range atoms - cannot compute PPPM
+
+One or more atoms are attempting to map their charge to a PPPM grid
+point that is not owned by a processor. This is likely for one of two
+reasons, both of them bad. First, it may mean that an atom near the
+boundary of a processor's sub-domain has moved more than 1/2 the
+"neighbor skin distance"_neighbor.html without neighbor lists being
+rebuilt and atoms being migrated to new processors. This also means
+you may be missing pairwise interactions that need to be computed.
+The solution is to change the re-neighboring criteria via the
+"neigh_modify"_neigh_modify command. The safest settings are "delay 0
+every 1 check yes". Second, it may mean that an atom has moved far
+outside a processor's sub-domain or even the entire simulation box.
+This indicates bad physics, e.g. due to highly overlapping atoms, too
+large a timestep, etc.
+
+E: Using kspace solver PPPMDipole on system with no dipoles
+
+Must have non-zero dipoles with PPPMDipole.
+
+E: Must use kspace_modify gewald for system with no dipoles
+
+Self-explanatory.
+
+*/
diff --git a/src/KSPACE/pppm_dipole_spin.cpp b/src/KSPACE/pppm_dipole_spin.cpp
new file mode 100644
index 0000000000..878d40c82e
--- /dev/null
+++ b/src/KSPACE/pppm_dipole_spin.cpp
@@ -0,0 +1,755 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Julien Tranchida (SNL)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include "pppm_dipole_spin.h"
+#include "atom.h"
+#include "comm.h"
+#include "gridcomm.h"
+#include "neighbor.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "domain.h"
+#include "fft3d_wrap.h"
+#include "remap_wrap.h"
+#include "memory.h"
+#include "error.h"
+#include "update.h"
+
+#include "math_const.h"
+#include "math_special.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecial;
+
+#define MAXORDER 7
+#define OFFSET 16384
+#define LARGE 10000.0
+#define SMALL 0.00001
+#define EPS_HOC 1.0e-7
+
+enum{REVERSE_MU};
+enum{FORWARD_MU,FORWARD_MU_PERATOM};
+
+#ifdef FFT_SINGLE
+#define ZEROF 0.0f
+#define ONEF  1.0f
+#else
+#define ZEROF 0.0
+#define ONEF  1.0
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+PPPMDipoleSpin::PPPMDipoleSpin(LAMMPS *lmp) : 
+  PPPMDipole(lmp)
+{
+  dipoleflag = 0;
+  spinflag = 1;
+  
+  hbar = force->hplanck/MY_2PI;         	// eV/(rad.THz)
+  mub = 9.274e-4;                     		// in A.Ang^2
+  mu_0 = 785.15;               			// in eV/Ang/A^2
+  mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);	// in eV.Ang^3
+  mub2mu0hbinv = mub2mu0 / hbar;        	// in rad.THz
+}
+
+/* ----------------------------------------------------------------------
+   free all memory
+------------------------------------------------------------------------- */
+
+PPPMDipoleSpin::~PPPMDipoleSpin()
+{
+  if (copymode) return;
+
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+  fft1 = NULL;
+  fft2 = NULL;
+  remap = NULL;
+  cg_dipole = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   called once before run
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::init()
+{
+  if (me == 0) {
+    if (screen) fprintf(screen,"PPPMDipoleSpin initialization ...\n");
+    if (logfile) fprintf(logfile,"PPPMDipoleSpin initialization ...\n");
+  }
+
+  // error check
+
+  spinflag = atom->sp?1:0;
+  
+  triclinic_check();
+
+  if (triclinic != domain->triclinic)
+    error->all(FLERR,"Must redefine kspace_style after changing to triclinic box");
+
+  if (domain->dimension == 2) error->all(FLERR,
+                                         "Cannot use PPPMDipoleSpin with 2d simulation");
+  if (comm->style != 0)
+    error->universe_all(FLERR,"PPPMDipoleSpin can only currently be used with "
+                        "comm_style brick");
+
+  if (!atom->sp) error->all(FLERR,"Kspace style requires atom attribute sp");
+
+  if (atom->sp && differentiation_flag == 1) error->all(FLERR,"Cannot (yet) use"
+     " kspace_modify diff ad with spins");
+
+  if (spinflag && strcmp(update->unit_style,"metal") != 0)
+    error->all(FLERR,"'metal' units have to be used with spins");
+
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all(FLERR,"Cannot use nonperiodic boundaries with PPPMDipoleSpin");
+  if (slabflag) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all(FLERR,"Incorrect boundaries with slab PPPMDipoleSpin");
+  }
+
+  if (order < 2 || order > MAXORDER) {
+    char str[128];
+    sprintf(str,"PPPMDipoleSpin order cannot be < 2 or > than %d",MAXORDER);
+    error->all(FLERR,str);
+  }
+
+  // extract short-range Coulombic cutoff from pair style
+
+  triclinic = domain->triclinic;
+  if (triclinic)
+    error->all(FLERR,"Cannot yet use triclinic cells with PPPMDipoleSpin");
+
+  pair_check();
+
+  int itmp = 0;
+  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
+  // check the correct extract here
+  if (p_cutoff == NULL)
+    error->all(FLERR,"KSpace style is incompatible with Pair style");
+  cutoff = *p_cutoff;
+
+  // kspace TIP4P not yet supported
+  // qdist = offset only for TIP4P fictitious charge
+  
+  qdist = 0.0; 
+  if (tip4pflag)
+    error->all(FLERR,"Cannot yet use TIP4P with PPPMDipoleSpin");
+
+  scale = 1.0;
+  spsum_spsq();
+  natoms_original = atom->natoms;
+
+  // set accuracy (force units) from accuracy_relative or accuracy_absolute
+
+  // is two_charge_force still relevant for spin systems?
+
+  if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
+  else accuracy = accuracy_relative * two_charge_force;
+
+  // free all arrays previously allocated
+
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+
+  // setup FFT grid resolution and g_ewald
+  // normally one iteration thru while loop is all that is required
+  // if grid stencil does not extend beyond neighbor proc
+  //   or overlap is allowed, then done
+  // else reduce order and try again
+
+  int (*procneigh)[2] = comm->procneigh;
+
+  GridComm *cgtmp = NULL;
+  int iteration = 0;
+    
+  while (order >= minorder) {
+    if (iteration && me == 0)
+      error->warning(FLERR,"Reducing PPPMDipoleSpin order b/c stencil extends "
+                     "beyond nearest neighbor processor");
+
+    compute_gf_denom();
+    set_grid_global();
+    set_grid_local();
+    if (overlap_allowed) break;
+
+    cgtmp = new GridComm(lmp,world,1,1,
+                         nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                         nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                         procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                         procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+    cgtmp->ghost_notify();
+    if (!cgtmp->ghost_overlap()) break;
+    delete cgtmp;
+
+    order--;
+    iteration++;
+  }
+
+  if (order < minorder) error->all(FLERR,"PPPMDipoleSpin order < minimum allowed order");
+  if (!overlap_allowed && cgtmp->ghost_overlap())
+    error->all(FLERR,"PPPMDipoleSpin grid stencil extends "
+               "beyond nearest neighbor processor");
+  if (cgtmp) delete cgtmp;
+
+  // adjust g_ewald
+
+  if (!gewaldflag) adjust_gewald();
+
+  // calculate the final accuracy
+
+  double estimated_accuracy = final_accuracy_dipole();
+
+  // print stats
+
+  int ngrid_max,nfft_both_max;
+  MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world);
+  MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world);
+
+  if (me == 0) {
+
+#ifdef FFT_SINGLE
+    const char fft_prec[] = "single";
+#else
+    const char fft_prec[] = "double";
+#endif
+
+    if (screen) {
+      fprintf(screen,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(screen,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      fprintf(screen,"  stencil order = %d\n",order);
+      fprintf(screen,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(screen,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(screen,"  using %s precision FFTs\n",fft_prec);
+      fprintf(screen,"  3d grid and FFT values/proc = %d %d\n",
+              ngrid_max,nfft_both_max);
+    }
+    if (logfile) {
+      fprintf(logfile,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(logfile,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      fprintf(logfile,"  stencil order = %d\n",order);
+      fprintf(logfile,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(logfile,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(logfile,"  using %s precision FFTs\n",fft_prec);
+      fprintf(logfile,"  3d grid and FFT values/proc = %d %d\n",
+              ngrid_max,nfft_both_max);
+    }
+  }
+
+  // allocate K-space dependent memory
+  // don't invoke allocate peratom(), will be allocated when needed
+
+  allocate();
+  cg_dipole->ghost_notify();
+  cg_dipole->setup();
+
+  // pre-compute Green's function denominator expansion
+  // pre-compute 1d charge distribution coefficients
+
+  compute_gf_denom();
+  compute_rho_coeff();
+}
+
+/* ----------------------------------------------------------------------
+   compute the PPPMDipoleSpin long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::compute(int eflag, int vflag)
+{
+  int i,j;
+
+  // set energy/virial flags
+  // invoke allocate_peratom() if needed for first time
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  if (vflag_atom)
+    error->all(FLERR,"Cannot (yet) compute per-atom virial "
+                       "with kspace style pppm/dipole/spin");
+
+  if (evflag_atom && !peratom_allocate_flag) {
+    allocate_peratom();
+    cg_peratom_dipole->ghost_notify();
+    cg_peratom_dipole->setup();
+  }
+
+  // if atom count has changed, update qsum and qsqsum
+
+  if (atom->natoms != natoms_original) {
+    spsum_spsq();
+    natoms_original = atom->natoms;
+  }
+
+  // return if there are no spins
+
+  if (musqsum == 0.0) return;
+
+  // convert atoms from box to lamda coords
+
+  boxlo = domain->boxlo;
+
+  // extend size of per-atom arrays if necessary
+
+  if (atom->nmax > nmax) {
+    memory->destroy(part2grid);
+    nmax = atom->nmax;
+    memory->create(part2grid,nmax,3,"pppm_spin:part2grid");
+  }
+
+  // find grid points for all my particles
+  // map my particle charge onto my local 3d on-grid density
+
+  particle_map();
+  make_rho_spin();
+
+  // all procs communicate density values from their ghost cells
+  //   to fully sum contribution in their 3d bricks
+  // remap from 3d decomposition to FFT decomposition
+
+  cg_dipole->reverse_comm(this,REVERSE_MU);
+  brick2fft_dipole();
+
+  // compute potential gradient on my FFT grid and
+  //   portion of e_long on this proc's FFT grid
+  // return gradients (electric fields) in 3d brick decomposition
+  // also performs per-atom calculations via poisson_peratom()
+
+  poisson_ik_dipole();
+
+  // all procs communicate E-field values
+  // to fill ghost cells surrounding their 3d bricks
+
+  cg_dipole->forward_comm(this,FORWARD_MU);
+
+  // extra per-atom energy/virial communication
+
+  if (evflag_atom) {
+    cg_peratom_dipole->forward_comm(this,FORWARD_MU_PERATOM);
+  }
+
+  // calculate the force on my particles
+
+  fieldforce_ik_spin();
+
+  // extra per-atom energy/virial communication
+
+  if (evflag_atom) fieldforce_peratom_spin();
+
+  // sum global energy across procs and add in volume-dependent term
+
+  const double spscale = mub2mu0 * scale;
+  const double g3 = g_ewald*g_ewald*g_ewald;
+
+  if (eflag_global) {
+    double energy_all;
+    MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
+    energy = energy_all;
+
+    energy *= 0.5*volume;
+    energy -= musqsum*2.0*g3/3.0/MY_PIS;
+    energy *= spscale;
+  }
+
+  // sum global virial across procs
+
+  if (vflag_global) {
+    double virial_all[6];
+    MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
+    for (i = 0; i < 6; i++) virial[i] = 0.5*spscale*volume*virial_all[i];
+  }
+
+  // per-atom energy/virial
+  // energy includes self-energy correction
+
+  if (evflag_atom) {
+    double **sp = atom->sp;
+    double spx,spy,spz;
+    int nlocal = atom->nlocal;
+    int ntotal = nlocal;
+
+    if (eflag_atom) {
+      for (i = 0; i < nlocal; i++) {
+	spx = sp[i][0]*sp[i][3];
+	spy = sp[i][1]*sp[i][3];
+	spz = sp[i][2]*sp[i][3];
+        eatom[i] *= 0.5;
+        eatom[i] -= (spx*spx + spy*spy + spz*spz)*2.0*g3/3.0/MY_PIS;
+        eatom[i] *= spscale;
+      }
+    }
+
+    if (vflag_atom) {
+      for (i = 0; i < ntotal; i++)
+        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*spscale;
+    }
+  }
+
+  // 2d slab correction
+
+  if (slabflag == 1) slabcorr();
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = charge "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::make_rho_spin()
+{
+  int l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz;
+  FFT_SCALAR x0,y0,z0;
+  FFT_SCALAR x1,y1,z1;
+  FFT_SCALAR x2,y2,z2;
+
+  // clear 3d density array
+
+  memset(&(densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+  memset(&(densityy_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+  memset(&(densityz_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  double **sp = atom->sp;
+  double spx,spy,spz;
+  double **x = atom->x;
+  int nlocal = atom->nlocal;
+
+  for (int i = 0; i < nlocal; i++) {
+
+    nx = part2grid[i][0];
+    ny = part2grid[i][1];
+    nz = part2grid[i][2];
+    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(dx,dy,dz);
+
+    spx = sp[i][0]*sp[i][3];
+    spy = sp[i][1]*sp[i][3];
+    spz = sp[i][2]*sp[i][3];
+    z0 = delvolinv * spx;
+    z1 = delvolinv * spy;
+    z2 = delvolinv * spz;
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      y0 = z0*rho1d[2][n];
+      y1 = z1*rho1d[2][n];
+      y2 = z2*rho1d[2][n];
+      for (m = nlower; m <= nupper; m++) {
+        my = m+ny;
+        x0 = y0*rho1d[1][m];
+        x1 = y1*rho1d[1][m];
+        x2 = y2*rho1d[1][m];
+        for (l = nlower; l <= nupper; l++) {
+          mx = l+nx;
+          densityx_brick_dipole[mz][my][mx] += x0*rho1d[0][l];
+          densityy_brick_dipole[mz][my][mx] += x1*rho1d[0][l];
+          densityz_brick_dipole[mz][my][mx] += x2*rho1d[0][l];
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get magnetic field & force on my particles for ik
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::fieldforce_ik_spin()
+{
+  int i,l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz;
+  FFT_SCALAR x0,y0,z0;
+  FFT_SCALAR ex,ey,ez;
+  FFT_SCALAR vxx,vyy,vzz,vxy,vxz,vyz;
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  double **sp = atom->sp;
+  double spx,spy,spz;
+  double **x = atom->x;
+  double **f = atom->f;
+  double **fm_long = atom->fm_long;
+
+  int nlocal = atom->nlocal;
+
+  for (i = 0; i < nlocal; i++) {
+    nx = part2grid[i][0];
+    ny = part2grid[i][1];
+    nz = part2grid[i][2];
+    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(dx,dy,dz);
+
+    ex = ey = ez = ZEROF;
+    vxx = vyy = vzz = vxy = vxz = vyz = ZEROF;
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      z0 = rho1d[2][n];
+      for (m = nlower; m <= nupper; m++) {
+        my = m+ny;
+        y0 = z0*rho1d[1][m];
+        for (l = nlower; l <= nupper; l++) {
+          mx = l+nx;
+          x0 = y0*rho1d[0][l];
+          ex -= x0*ux_brick_dipole[mz][my][mx];
+          ey -= x0*uy_brick_dipole[mz][my][mx];
+          ez -= x0*uz_brick_dipole[mz][my][mx];
+          vxx -= x0*vdxx_brick_dipole[mz][my][mx];
+          vyy -= x0*vdyy_brick_dipole[mz][my][mx];
+          vzz -= x0*vdzz_brick_dipole[mz][my][mx];
+          vxy -= x0*vdxy_brick_dipole[mz][my][mx];
+          vxz -= x0*vdxz_brick_dipole[mz][my][mx];
+          vyz -= x0*vdyz_brick_dipole[mz][my][mx];
+        }
+      }
+    }
+
+    // convert M-field and store mech. forces
+
+    const double spfactor = mub2mu0 * scale;
+    spx = sp[i][0]*sp[i][3];
+    spy = sp[i][1]*sp[i][3];
+    spz = sp[i][2]*sp[i][3];
+    f[i][0] += spfactor*(vxx*spx + vxy*spy + vxz*spz);
+    f[i][1] += spfactor*(vxy*spx + vyy*spy + vyz*spz);
+    f[i][2] += spfactor*(vxz*spx + vyz*spy + vzz*spz);
+
+    // store long-range mag. precessions
+    
+    const double spfactorh = mub2mu0hbinv * scale;
+    fm_long[i][0] += spfactorh*ex;
+    fm_long[i][1] += spfactorh*ey;
+    fm_long[i][2] += spfactorh*ez;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::fieldforce_peratom_spin()
+{
+  int i,l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz,x0,y0,z0;
+  FFT_SCALAR ux,uy,uz;
+  FFT_SCALAR v0x,v1x,v2x,v3x,v4x,v5x;
+  FFT_SCALAR v0y,v1y,v2y,v3y,v4y,v5y;
+  FFT_SCALAR v0z,v1z,v2z,v3z,v4z,v5z;
+
+  // loop over my charges, interpolate from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  double **sp = atom->sp;
+  double spx,spy,spz;
+  double **x = atom->x;
+
+  int nlocal = atom->nlocal;
+
+  for (i = 0; i < nlocal; i++) {
+    nx = part2grid[i][0];
+    ny = part2grid[i][1];
+    nz = part2grid[i][2];
+    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(dx,dy,dz);
+
+    ux = uy = uz = ZEROF;
+    v0x = v1x = v2x = v3x = v4x = v5x = ZEROF;
+    v0y = v1y = v2y = v3y = v4y = v5y = ZEROF;
+    v0z = v1z = v2z = v3z = v4z = v5z = ZEROF;
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      z0 = rho1d[2][n];
+      for (m = nlower; m <= nupper; m++) {
+        my = m+ny;
+        y0 = z0*rho1d[1][m];
+        for (l = nlower; l <= nupper; l++) {
+          mx = l+nx;
+          x0 = y0*rho1d[0][l];
+          if (eflag_atom) {
+            ux += x0*ux_brick_dipole[mz][my][mx];
+            uy += x0*uy_brick_dipole[mz][my][mx];
+            uz += x0*uz_brick_dipole[mz][my][mx];
+          }
+          if (vflag_atom) {
+            v0x += x0*v0x_brick_dipole[mz][my][mx];
+            v1x += x0*v1x_brick_dipole[mz][my][mx];
+            v2x += x0*v2x_brick_dipole[mz][my][mx];
+            v3x += x0*v3x_brick_dipole[mz][my][mx];
+            v4x += x0*v4x_brick_dipole[mz][my][mx];
+            v5x += x0*v5x_brick_dipole[mz][my][mx];
+            v0y += x0*v0y_brick_dipole[mz][my][mx];
+            v1y += x0*v1y_brick_dipole[mz][my][mx];
+            v2y += x0*v2y_brick_dipole[mz][my][mx];
+            v3y += x0*v3y_brick_dipole[mz][my][mx];
+            v4y += x0*v4y_brick_dipole[mz][my][mx];
+            v5y += x0*v5y_brick_dipole[mz][my][mx];
+            v0z += x0*v0z_brick_dipole[mz][my][mx];
+            v1z += x0*v1z_brick_dipole[mz][my][mx];
+            v2z += x0*v2z_brick_dipole[mz][my][mx];
+            v3z += x0*v3z_brick_dipole[mz][my][mx];
+            v4z += x0*v4z_brick_dipole[mz][my][mx];
+            v5z += x0*v5z_brick_dipole[mz][my][mx];
+          }
+        }
+      }
+    }
+
+    spx = sp[i][0]*sp[i][3];
+    spy = sp[i][1]*sp[i][3];
+    spz = sp[i][2]*sp[i][3];
+    if (eflag_atom) eatom[i] += spx*ux + spy*uy + spz*uz;
+    if (vflag_atom) {
+      vatom[i][0] += spx*v0x + spy*v0y + spz*v0z;
+      vatom[i][1] += spx*v1x + spy*v1y + spz*v1z;
+      vatom[i][2] += spx*v2x + spy*v2y + spz*v2z;
+      vatom[i][3] += spx*v3x + spy*v3y + spz*v3z;
+      vatom[i][4] += spx*v4x + spy*v4y + spz*v4z;
+      vatom[i][5] += spx*v5x + spy*v5y + spz*v5z;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Slab-geometry correction term to dampen inter-slab interactions between
+   periodically repeating slabs.  Yields good approximation to 2D Ewald if
+   adequate empty space is left between repeating slabs (J. Chem. Phys.
+   111, 3155).  Slabs defined here to be parallel to the xy plane. Also
+   extended to non-neutral systems (J. Chem. Phys. 131, 094107).
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::slabcorr()
+{
+  // compute local contribution to global spin moment
+
+  double **x = atom->x;
+  double zprd = domain->zprd;
+  int nlocal = atom->nlocal;
+
+  double spin = 0.0;
+  double **sp = atom->sp;
+  double spx,spy,spz;
+  for (int i = 0; i < nlocal; i++) { 
+    spz = sp[i][2]*sp[i][3];
+    spin += spz;
+  }
+
+  // sum local contributions to get global spin moment
+
+  double spin_all;
+  MPI_Allreduce(&spin,&spin_all,1,MPI_DOUBLE,MPI_SUM,world);
+
+  // compute corrections
+
+  const double e_slabcorr = MY_2PI*(spin_all*spin_all/12.0)/volume;
+  const double spscale = mub2mu0 * scale;
+
+  if (eflag_global) energy += spscale * e_slabcorr;
+
+  // per-atom energy
+
+  if (eflag_atom) {
+    double efact = spscale * MY_2PI/volume/12.0;
+    for (int i = 0; i < nlocal; i++) {
+      spz = sp[i][2]*sp[i][3];
+      eatom[i] += efact * spz * spin_all;
+    }
+  }
+
+  // add on mag. force corrections
+
+  double ffact = spscale * (-4.0*MY_PI/volume);
+  double **fm_long = atom->fm_long;
+  for (int i = 0; i < nlocal; i++) {
+    fm_long[i][2] += ffact * spin_all;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute spsum,spsqsum,sp2
+   called initially, when particle count changes, when spins are changed
+------------------------------------------------------------------------- */
+
+void PPPMDipoleSpin::spsum_spsq()
+{
+  const int nlocal = atom->nlocal;
+
+  musum = musqsum = mu2 = 0.0;
+  if (atom->sp_flag) {
+    double **sp = atom->sp;
+    double spx, spy, spz;
+    double spsum_local(0.0), spsqsum_local(0.0);
+
+    // sum (direction x norm) of all spins
+
+    for (int i = 0; i < nlocal; i++) {
+      spx = sp[i][0]*sp[i][3];
+      spy = sp[i][1]*sp[i][3];
+      spz = sp[i][2]*sp[i][3];
+      spsum_local += spx + spy + spz;
+      spsqsum_local += spx*spx + spy*spy + spz*spz;
+    }
+
+    // store results into pppm_dipole quantities 
+
+    MPI_Allreduce(&spsum_local,&musum,1,MPI_DOUBLE,MPI_SUM,world);
+    MPI_Allreduce(&spsqsum_local,&musqsum,1,MPI_DOUBLE,MPI_SUM,world);
+
+    //mu2 = musqsum * mub2mu0;
+    mu2 = musqsum;
+  }
+
+  if (mu2 == 0 && comm->me == 0)
+    error->all(FLERR,"Using kspace solver PPPMDipoleSpin on system with no spins");
+}
diff --git a/src/KSPACE/pppm_dipole_spin.h b/src/KSPACE/pppm_dipole_spin.h
new file mode 100644
index 0000000000..2b4a989d5c
--- /dev/null
+++ b/src/KSPACE/pppm_dipole_spin.h
@@ -0,0 +1,176 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/dipole/spin,PPPMDipoleSpin)
+
+#else
+
+#ifndef LMP_PPPM_DIPOLE_SPIN_H
+#define LMP_PPPM_DIPOLE_SPIN_H
+
+#include "pppm_dipole.h"
+
+namespace LAMMPS_NS {
+
+class PPPMDipoleSpin : public PPPMDipole {
+ public:
+  PPPMDipoleSpin(class LAMMPS *);
+  virtual ~PPPMDipoleSpin();
+  void init();
+  void compute(int, int);
+
+ protected:
+  double hbar;                  // reduced Planck's constant      
+  double mub;                   // Bohr's magneton                
+  double mu_0;                  // vacuum permeability
+  double mub2mu0;               // prefactor for mech force
+  double mub2mu0hbinv;          // prefactor for mag force
+
+  void slabcorr();
+
+  // spin
+
+  void make_rho_spin();
+  void fieldforce_ik_spin();
+  void fieldforce_peratom_spin();
+  void spsum_spsq();
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Cannot (yet) use charges with Kspace style PPPMDipoleSpin
+
+Charge-spin interactions are not yet implemented in PPPMDipoleSpin so this
+feature is not yet supported.
+
+E: Must redefine kspace_style after changing to triclinic box
+
+Self-explanatory.
+
+E: Kspace style requires atom attribute mu
+
+The atom style defined does not have this attribute.
+
+E: Cannot (yet) use kspace_modify diff ad with spins
+
+This feature is not yet supported.
+
+E: Cannot (yet) use 'electron' units with spins
+
+This feature is not yet supported.
+
+E: Cannot yet use triclinic cells with PPPMDipoleSpin
+
+This feature is not yet supported.
+
+E: Cannot yet use TIP4P with PPPMDipoleSpin
+
+This feature is not yet supported.
+
+E: Cannot use nonperiodic boundaries with PPPM
+
+For kspace style pppm, all 3 dimensions must have periodic boundaries
+unless you use the kspace_modify command to define a 2d slab with a
+non-periodic z dimension.
+
+E: Incorrect boundaries with slab PPPM
+
+Must have periodic x,y dimensions and non-periodic z dimension to use
+2d slab option with PPPM.
+
+E: PPPM order cannot be < 2 or > than %d
+
+This is a limitation of the PPPM implementation in LAMMPS.
+
+E: KSpace style is incompatible with Pair style
+
+Setting a kspace style requires that a pair style with matching
+long-range spin components be used.
+
+W: Reducing PPPM order b/c stencil extends beyond nearest neighbor processor
+
+This may lead to a larger grid than desired. See the kspace_modify overlap
+command to prevent changing of the PPPM order.
+
+E: PPPM order < minimum allowed order
+
+The default minimum order is 2. This can be reset by the
+kspace_modify minorder command.
+
+E: PPPM grid stencil extends beyond nearest neighbor processor
+
+This is not allowed if the kspace_modify overlap setting is no.
+
+E: Cannot (yet) compute per-atom virial with kspace style pppm/dipole/spin
+
+This feature is not yet supported.
+
+E: KSpace accuracy must be > 0
+
+The kspace accuracy designated in the input must be greater than zero.
+
+E: Could not compute grid size
+
+The code is unable to compute a grid size consistent with the desired
+accuracy. This error should not occur for typical problems. Please
+send an email to the developers.
+
+E: PPPM grid is too large
+
+The global PPPM grid is larger than OFFSET in one or more dimensions.
+OFFSET is currently set to 4096. You likely need to decrease the
+requested accuracy.
+
+E: Could not compute g_ewald
+
+The Newton-Raphson solver failed to converge to a good value for
+g_ewald. This error should not occur for typical problems. Please
+send an email to the developers.
+
+E: Non-numeric box dimensions - simulation unstable
+
+The box size has apparently blown up.
+
+E: Out of range atoms - cannot compute PPPM
+
+One or more atoms are attempting to map their charge to a PPPM grid
+point that is not owned by a processor. This is likely for one of two
+reasons, both of them bad. First, it may mean that an atom near the
+boundary of a processor's sub-domain has moved more than 1/2 the
+"neighbor skin distance"_neighbor.html without neighbor lists being
+rebuilt and atoms being migrated to new processors. This also means
+you may be missing pairwise interactions that need to be computed.
+The solution is to change the re-neighboring criteria via the
+"neigh_modify"_neigh_modify command. The safest settings are "delay 0
+every 1 check yes". Second, it may mean that an atom has moved far
+outside a processor's sub-domain or even the entire simulation box.
+This indicates bad physics, e.g. due to highly overlapping atoms, too
+large a timestep, etc.
+
+E: Using kspace solver PPPMDipoleSpin on system with no spins
+
+Must have non-zero spins with PPPMDipoleSpin.
+
+E: Must use kspace_modify gewald for system with no spins
+
+Self-explanatory.
+
+*/
diff --git a/src/SPIN/atom_vec_spin.cpp b/src/SPIN/atom_vec_spin.cpp
index a6c0430940..37d6fb1e59 100644
--- a/src/SPIN/atom_vec_spin.cpp
+++ b/src/SPIN/atom_vec_spin.cpp
@@ -48,7 +48,7 @@ AtomVecSpin::AtomVecSpin(LAMMPS *lmp) : AtomVec(lmp)
   comm_x_only = 0;
   comm_f_only = 0;
   size_forward = 7;
-  size_reverse = 6;
+  size_reverse = 9;
   size_border = 10;
   size_velocity = 3;
   size_data_atom = 9;
@@ -58,7 +58,6 @@ AtomVecSpin::AtomVecSpin(LAMMPS *lmp) : AtomVec(lmp)
   atom->sp_flag = 1;
 }
 
-
 /* ----------------------------------------------------------------------
    grow atom arrays
    n = 0 grows arrays by a chunk
@@ -88,6 +87,7 @@ void AtomVecSpin::grow(int n)
 
   sp = memory->grow(atom->sp,nmax,4,"atom:sp");
   fm = memory->grow(atom->fm,nmax*comm->nthreads,3,"atom:fm");
+  fm_long = memory->grow(atom->fm_long,nmax*comm->nthreads,3,"atom:fm_long");
 
   if (atom->nextra_grow)
     for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@@ -103,10 +103,9 @@ void AtomVecSpin::grow_reset()
   tag = atom->tag; type = atom->type;
   mask = atom->mask; image = atom->image;
   x = atom->x; v = atom->v; f = atom->f;
-  sp = atom->sp; fm = atom->fm;
+  sp = atom->sp; fm = atom->fm; fm_long = atom->fm_long;
 }
 
-
 /* ----------------------------------------------------------------------
    copy atom I info to atom J
 ------------------------------------------------------------------------- */
@@ -342,6 +341,9 @@ int AtomVecSpin::pack_reverse(int n, int first, double *buf)
     buf[m++] = fm[i][0];
     buf[m++] = fm[i][1];
     buf[m++] = fm[i][2];
+    buf[m++] = fm_long[i][0];
+    buf[m++] = fm_long[i][1];
+    buf[m++] = fm_long[i][2];
   }
 
   return m;
@@ -361,6 +363,9 @@ void AtomVecSpin::unpack_reverse(int n, int *list, double *buf)
     fm[j][0] += buf[m++];
     fm[j][1] += buf[m++];
     fm[j][2] += buf[m++];
+    fm_long[j][0] += buf[m++];
+    fm_long[j][1] += buf[m++];
+    fm_long[j][2] += buf[m++];
   }
 }
 
@@ -939,6 +944,7 @@ bigint AtomVecSpin::memory_usage()
 
   if (atom->memcheck("sp")) bytes += memory->usage(sp,nmax,4);
   if (atom->memcheck("fm")) bytes += memory->usage(fm,nmax*comm->nthreads,3);
+  if (atom->memcheck("fm_long")) bytes += memory->usage(fm_long,nmax*comm->nthreads,3);
 
   return bytes;
 }
@@ -951,6 +957,7 @@ void AtomVecSpin::force_clear(int /*n*/, size_t nbytes)
 {
   memset(&atom->f[0][0],0,3*nbytes);
   memset(&atom->fm[0][0],0,3*nbytes);
+  memset(&atom->fm_long[0][0],0,3*nbytes);
 }
 
 
diff --git a/src/SPIN/atom_vec_spin.h b/src/SPIN/atom_vec_spin.h
index 34bc55b99f..1f1c34df52 100644
--- a/src/SPIN/atom_vec_spin.h
+++ b/src/SPIN/atom_vec_spin.h
@@ -68,10 +68,12 @@ class AtomVecSpin : public AtomVec {
   int *type,*mask;
   imageint *image;
   double **x,**v,**f;		// lattice quantities
-  double **sp,**fm;		// spin quantities
-  				// sp[i][0-2] direction of the spin i
+  
+  				// spin quantities
+  double **sp;			// sp[i][0-2] direction of the spin i
 				// sp[i][3] atomic magnetic moment of the spin i
-
+  double **fm;			// fm[i][0-2] direction of magnetic precession
+  double **fm_long;		// storage of long-range spin prec. components
 };
 
 }
diff --git a/src/SPIN/fix_nve_spin.cpp b/src/SPIN/fix_nve_spin.cpp
index 9b67b29492..595ddb0cc2 100644
--- a/src/SPIN/fix_nve_spin.cpp
+++ b/src/SPIN/fix_nve_spin.cpp
@@ -129,6 +129,7 @@ FixNVESpin::FixNVESpin(LAMMPS *lmp, int narg, char **arg) :
   // initialize the magnetic interaction flags
 
   pair_spin_flag = 0;
+  long_spin_flag = 0;
   precession_spin_flag = 0;
   maglangevin_flag = 0;
   tdamp_flag = temp_flag = 0;
@@ -213,8 +214,16 @@ void FixNVESpin::init()
   if (count != npairspin)
     error->all(FLERR,"Incorrect number of spin pairs");
 
+  // set pair/spin and long/spin flags
+
   if (npairspin >= 1) pair_spin_flag = 1;
 
+  for (int i = 0; i<npairs; i++) {
+    if (force->pair_match("spin/long",0,i)) {
+      long_spin_flag = 1;
+    }
+  }
+
   // ptrs FixPrecessionSpin classes
 
   int iforce;
diff --git a/src/SPIN/fix_nve_spin.h b/src/SPIN/fix_nve_spin.h
index 4e135e3c2f..4800575c06 100644
--- a/src/SPIN/fix_nve_spin.h
+++ b/src/SPIN/fix_nve_spin.h
@@ -48,7 +48,6 @@ friend class PairSpin;
   int lattice_flag; 			// lattice_flag = 0 if spins only
   					// lattice_flag = 1 if spin-lattice calc.
 
-
  protected:
   int sector_flag;			// sector_flag = 0  if serial algorithm
   					// sector_flag = 1  if parallel algorithm
@@ -58,6 +57,7 @@ friend class PairSpin;
   int nlocal_max;			// max value of nlocal (for lists size)
 
   int pair_spin_flag;			// magnetic pair flags
+  int long_spin_flag;			// magnetic long-range flag
   int precession_spin_flag;		// magnetic precession flags
   int maglangevin_flag;			// magnetic langevin flags
   int tdamp_flag, temp_flag;
diff --git a/src/SPIN/pair_spin_dipole_cut.cpp b/src/SPIN/pair_spin_dipole_cut.cpp
new file mode 100644
index 0000000000..4ff198488a
--- /dev/null
+++ b/src/SPIN/pair_spin_dipole_cut.cpp
@@ -0,0 +1,537 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   www.cs.sandia.gov/~sjplimp/lammps.html
+   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ------------------------------------------------------------------------
+   Contributing authors: Julien Tranchida (SNL)
+                         Stan Moore (SNL)
+
+   Please cite the related publication:
+   Tranchida, J., Plimpton, S. J., Thibaudeau, P., & Thompson, A. P. (2018).
+   Massively parallel symplectic algorithm for coupled magnetic spin dynamics
+   and molecular dynamics. Journal of Computational Physics.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_spin_dipole_cut.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "fix_nve_spin.h"
+#include "force.h"
+#include "math_const.h"
+#include "memory.h"
+#include "modify.h"
+#include "error.h"
+#include "update.h"
+
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+/* ---------------------------------------------------------------------- */
+
+PairSpinDipoleCut::PairSpinDipoleCut(LAMMPS *lmp) : PairSpin(lmp),
+lockfixnvespin(NULL)
+{
+  single_enable = 0;
+  spinflag = 1;
+  respa_enable = 0;
+  no_virial_fdotr_compute = 1;
+  lattice_flag = 0;
+
+  hbar = force->hplanck/MY_2PI;			// eV/(rad.THz)
+  mub = 9.274e-4;                               // in A.Ang^2
+  mu_0 = 785.15;                                // in eV/Ang/A^2
+  mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);     // in eV.Ang^3
+  //mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);	// in eV
+  mub2mu0hbinv = mub2mu0 / hbar;		// in rad.THz
+
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairSpinDipoleCut::~PairSpinDipoleCut()
+{
+  if (allocated) {
+    memory->destroy(setflag);
+    memory->destroy(cut_spin_long);
+    memory->destroy(cutsq);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::settings(int narg, char **arg)
+{
+  if (narg < 1 || narg > 2)
+    error->all(FLERR,"Incorrect args in pair_style command");
+
+  if (strcmp(update->unit_style,"metal") != 0)
+    error->all(FLERR,"Spin simulations require metal unit style");
+
+  if (!atom->sp) 
+    error->all(FLERR,"Pair/spin style requires atom attribute sp");
+
+  cut_spin_long_global = force->numeric(FLERR,arg[0]);
+  
+  // reset cutoffs that have been explicitly set
+  
+  if (allocated) {
+    int i,j;
+    for (i = 1; i <= atom->ntypes; i++) {
+      for (j = i+1; j <= atom->ntypes; j++) {
+        if (setflag[i][j]) {
+          cut_spin_long[i][j] = cut_spin_long_global;
+        }
+      }
+    }
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+ 
+  if (narg != 3) 
+    error->all(FLERR,"Incorrect args in pair_style command");
+  
+  int ilo,ihi,jlo,jhi;
+  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+
+  double spin_long_cut_one = force->numeric(FLERR,arg[2]);
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      setflag[i][j] = 1;
+      cut_spin_long[i][j] = spin_long_cut_one;
+      count++;
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::init_style()
+{
+  if (!atom->sp_flag)
+    error->all(FLERR,"Pair spin requires atom/spin style");
+  
+  // need a full neighbor list
+
+  int irequest = neighbor->request(this,instance_me);
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+  
+  // checking if nve/spin or neb/spin are a listed fix
+
+  int ifix = 0;
+  while (ifix < modify->nfix) {
+    if (strcmp(modify->fix[ifix]->style,"nve/spin") == 0) break;
+    if (strcmp(modify->fix[ifix]->style,"neb/spin") == 0) break;
+    ifix++;
+  }
+  if ((ifix == modify->nfix) && (comm->me == 0))
+    error->warning(FLERR,"Using pair/spin style without nve/spin or neb/spin");
+
+  // get the lattice_flag from nve/spin
+
+  for (int i = 0; i < modify->nfix; i++) {
+    if (strcmp(modify->fix[i]->style,"nve/spin") == 0) {
+      lockfixnvespin = (FixNVESpin *) modify->fix[i];
+      lattice_flag = lockfixnvespin->lattice_flag;
+    }
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+double PairSpinDipoleCut::init_one(int i, int j)
+{
+  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
+  
+  cut_spin_long[j][i] = cut_spin_long[i][j];
+  
+  return cut_spin_long_global;
+}
+
+/* ----------------------------------------------------------------------
+   extract the larger cutoff if "cut" or "cut_coul"
+------------------------------------------------------------------------- */
+
+void *PairSpinDipoleCut::extract(const char *str, int &dim)
+{
+  if (strcmp(str,"cut") == 0) {
+    dim = 0;
+    return (void *) &cut_spin_long_global;
+  } else if (strcmp(str,"cut_coul") == 0) {
+    dim = 0;
+    return (void *) &cut_spin_long_global;
+  } else if (strcmp(str,"ewald_order") == 0) {
+    ewald_order = 0;
+    ewald_order |= 1<<1;
+    ewald_order |= 1<<3;
+    dim = 0;
+    return (void *) &ewald_order;
+  } else if (strcmp(str,"ewald_mix") == 0) {
+    dim = 0;
+    return (void *) &mix_flag;
+  }
+  return NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::compute(int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;  
+  int *ilist,*jlist,*numneigh,**firstneigh;  
+  double rinv,r2inv,r3inv,rsq,local_cut2,evdwl,ecoul;
+  double xi[3],rij[3],eij[3],spi[4],spj[4],fi[3],fmi[3];
+
+  evdwl = ecoul = 0.0;
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  int *type = atom->type;  
+  int nlocal = atom->nlocal;  
+  int newton_pair = force->newton_pair;
+  double **x = atom->x;
+  double **f = atom->f;
+  double **fm = atom->fm;
+  double **sp = atom->sp;	
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // computation of the exchange interaction
+  // loop over atoms and their neighbors
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    xi[0] = x[i][0];
+    xi[1] = x[i][1];
+    xi[2] = x[i][2];
+    jlist = firstneigh[i];
+    jnum = numneigh[i]; 
+    spi[0] = sp[i][0]; 
+    spi[1] = sp[i][1]; 
+    spi[2] = sp[i][2];
+    spi[3] = sp[i][3];
+    itype = type[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+
+      spj[0] = sp[j][0]; 
+      spj[1] = sp[j][1]; 
+      spj[2] = sp[j][2]; 
+      spj[3] = sp[j][3]; 
+
+      evdwl = 0.0;
+      fi[0] = fi[1] = fi[2] = 0.0;
+      fmi[0] = fmi[1] = fmi[2] = 0.0;
+     
+      rij[0] = x[j][0] - xi[0];
+      rij[1] = x[j][1] - xi[1];
+      rij[2] = x[j][2] - xi[2];
+      rsq = rij[0]*rij[0] + rij[1]*rij[1] + rij[2]*rij[2];
+      rinv = 1.0/sqrt(rsq);
+      eij[0] = rij[0]*rinv;
+      eij[1] = rij[1]*rinv;
+      eij[2] = rij[2]*rinv;
+
+      local_cut2 = cut_spin_long[itype][jtype]*cut_spin_long[itype][jtype];
+
+      if (rsq < local_cut2) {
+        r2inv = 1.0/rsq;
+	r3inv = r2inv*rinv;
+	
+	compute_dipolar(i,j,eij,fmi,spi,spj,r3inv);
+	if (lattice_flag) compute_dipolar_mech(i,j,eij,fi,spi,spj,r2inv);
+      }
+
+      // force accumulation
+
+      f[i][0] += fi[0]; 
+      f[i][1] += fi[1];  	  
+      f[i][2] += fi[2];
+      fm[i][0] += fmi[0];
+      fm[i][1] += fmi[1];
+      fm[i][2] += fmi[2];
+
+      if (newton_pair || j < nlocal) {
+	f[j][0] -= fi[0];	 
+        f[j][1] -= fi[1];	  	  
+        f[j][2] -= fi[2];
+      }
+
+      if (eflag) {
+	if (rsq <= local_cut2) {
+	  evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
+	  evdwl *= hbar;
+	}
+      } else evdwl = 0.0;
+
+      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+	  evdwl,ecoul,fi[0],fi[1],fi[2],rij[0],rij[1],rij[2]);
+
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   update the pair interaction fmi acting on the spin ii
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::compute_single_pair(int ii, double fmi[3])
+{
+  int j,jnum,itype,jtype,ntypes; 
+  int *ilist,*jlist,*numneigh,**firstneigh;  
+  double rsq,rinv,r2inv,r3inv,local_cut2;
+  double xi[3],rij[3],eij[3],spi[4],spj[4];
+
+  int k,locflag;
+  int *type = atom->type;  
+  double **x = atom->x;
+  double **sp = atom->sp;	
+
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // check if interaction applies to type of ii
+  
+  itype = type[ii];
+  ntypes = atom->ntypes;
+  locflag = 0;
+  k = 1;
+  while (k <= ntypes) {
+    if (k <= itype) {
+      if (setflag[k][itype] == 1) {
+        locflag =1;
+        break;
+      }
+      k++;
+    } else if (k > itype) {
+      if (setflag[itype][k] == 1) {
+        locflag =1;
+        break;
+      }
+      k++;
+    } else error->all(FLERR,"Wrong type number");
+  }
+
+
+  // if interaction applies to type ii,
+  // locflag = 1 and compute pair interaction
+ 
+  if (locflag == 1) {
+
+    xi[0] = x[ii][0];
+    xi[1] = x[ii][1];
+    xi[2] = x[ii][2];
+    spi[0] = sp[ii][0]; 
+    spi[1] = sp[ii][1]; 
+    spi[2] = sp[ii][2];
+    spi[3] = sp[ii][3];
+    jlist = firstneigh[ii];
+    jnum = numneigh[ii]; 
+    
+    for (int jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+
+      spj[0] = sp[j][0]; 
+      spj[1] = sp[j][1]; 
+      spj[2] = sp[j][2]; 
+      spj[3] = sp[j][3]; 
+
+      rij[0] = x[j][0] - xi[0];
+      rij[1] = x[j][1] - xi[1];
+      rij[2] = x[j][2] - xi[2];
+      rsq = rij[0]*rij[0] + rij[1]*rij[1] + rij[2]*rij[2];
+      rinv = 1.0/sqrt(rsq);
+      eij[0] = rij[0]*rinv;
+      eij[1] = rij[1]*rinv;
+      eij[2] = rij[2]*rinv;
+
+      local_cut2 = cut_spin_long[itype][jtype]*cut_spin_long[itype][jtype];
+
+      if (rsq < local_cut2) {
+        r2inv = 1.0/rsq;
+        r3inv = r2inv*rinv;
+        
+        // compute dipolar interaction
+        
+        compute_dipolar(ii,j,eij,fmi,spi,spj,r3inv);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute dipolar interaction between spins i and j
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::compute_dipolar(int i, int j, double eij[3], 
+    double fmi[3], double spi[4], double spj[4], double r3inv)
+{
+  double sjdotr;
+  double gigjiri3,pre;
+
+  sjdotr = spj[0]*eij[0] + spj[1]*eij[1] + spj[2]*eij[2];
+  gigjiri3 = (spi[3] * spj[3])*r3inv;
+  pre = mub2mu0hbinv * gigjiri3;
+
+  fmi[0] += pre * (3.0 * sjdotr *eij[0] - spj[0]);
+  fmi[1] += pre * (3.0 * sjdotr *eij[1] - spj[1]);
+  fmi[2] += pre * (3.0 * sjdotr *eij[2] - spj[2]);
+}
+
+/* ----------------------------------------------------------------------
+   compute the mechanical force due to the dipolar interaction between 
+   atom i and atom j
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::compute_dipolar_mech(int i, int j, double eij[3],
+    double fi[3], double spi[3], double spj[3], double r2inv)
+{
+  double sisj,sieij,sjeij;
+  double gigjri4,bij,pre;
+
+  gigjri4 = (spi[3] * spj[3])*r2inv*r2inv;
+  sisj = spi[0]*spj[0] + spi[1]*spj[1] + spi[2]*spj[2];
+  sieij = spi[0]*eij[0] + spi[1]*eij[1] + spi[2]*eij[2];
+  sjeij = spj[0]*eij[0] + spj[1]*eij[1] + spj[2]*eij[2];
+  
+  bij = sisj - 5.0*sieij*sjeij;
+  pre = 3.0*mub2mu0*gigjri4;
+
+  fi[0] -= pre * (eij[0] * bij + (sjeij*spi[0] + sieij*spj[0]));
+  fi[1] -= pre * (eij[1] * bij + (sjeij*spi[1] + sieij*spj[1]));
+  fi[2] -= pre * (eij[2] * bij + (sjeij*spi[2] + sieij*spj[2]));
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::allocate()
+{
+  allocated = 1;
+  int n = atom->ntypes;
+
+  memory->create(setflag,n+1,n+1,"pair:setflag");
+  for (int i = 1; i <= n; i++)
+    for (int j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  memory->create(cut_spin_long,n+1,n+1,"pair/spin/long:cut_spin_long");
+  memory->create(cutsq,n+1,n+1,"pair/spin/long:cutsq");
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::write_restart(FILE *fp)
+{
+  write_restart_settings(fp);
+
+  int i,j;
+  for (i = 1; i <= atom->ntypes; i++) {
+    for (j = i; j <= atom->ntypes; j++) {
+      fwrite(&setflag[i][j],sizeof(int),1,fp);
+      if (setflag[i][j]) {
+	fwrite(&cut_spin_long[i][j],sizeof(int),1,fp);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::read_restart(FILE *fp)
+{
+  read_restart_settings(fp);
+
+  allocate();
+
+  int i,j;
+  int me = comm->me;
+  for (i = 1; i <= atom->ntypes; i++) {
+    for (j = i; j <= atom->ntypes; j++) {
+      if (me == 0) fread(&setflag[i][j],sizeof(int),1,fp);
+      MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
+      if (setflag[i][j]) {
+	if (me == 0) {
+	  fread(&cut_spin_long[i][j],sizeof(int),1,fp);
+	}
+	MPI_Bcast(&cut_spin_long[i][j],1,MPI_INT,0,world);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::write_restart_settings(FILE *fp)
+{
+  fwrite(&cut_spin_long_global,sizeof(double),1,fp);
+  fwrite(&mix_flag,sizeof(int),1,fp);
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleCut::read_restart_settings(FILE *fp)
+{
+  if (comm->me == 0) {
+    fread(&cut_spin_long_global,sizeof(double),1,fp);
+    fread(&mix_flag,sizeof(int),1,fp);
+  }
+  MPI_Bcast(&cut_spin_long_global,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
+}
diff --git a/src/SPIN/pair_spin_dipole_cut.h b/src/SPIN/pair_spin_dipole_cut.h
new file mode 100644
index 0000000000..77e452b179
--- /dev/null
+++ b/src/SPIN/pair_spin_dipole_cut.h
@@ -0,0 +1,100 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   www.cs.sandia.gov/~sjplimp/lammps.html
+   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(spin/dipole/cut,PairSpinDipoleCut)
+
+#else
+
+#ifndef LMP_PAIR_SPIN_DIPOLE_CUT_H
+#define LMP_PAIR_SPIN_DIPOLE_CUT_H
+
+#include "pair_spin.h"
+
+namespace LAMMPS_NS {
+
+class PairSpinDipoleCut : public PairSpin {
+ public:
+  double cut_coul;
+  double **sigma;
+
+  PairSpinDipoleCut(class LAMMPS *);
+  ~PairSpinDipoleCut();
+  void settings(int, char **);
+  void coeff(int, char **);
+  double init_one(int, int);
+  void init_style();
+  void *extract(const char *, int &); 
+  
+  void compute(int, int);
+  void compute_single_pair(int, double *);
+
+  void compute_dipolar(int, int, double *, double *, double *, 
+      double *, double);
+  void compute_dipolar_mech(int, int, double *, double *, double *, 
+      double *, double);
+
+  void write_restart(FILE *);
+  void read_restart(FILE *);
+  void write_restart_settings(FILE *);
+  void read_restart_settings(FILE *);
+  
+  double cut_spin_long_global;	// global long cutoff distance 
+
+ protected:
+  double hbar;	 		// reduced Planck's constant
+  double mub;			// Bohr's magneton
+  double mu_0;			// vacuum permeability
+  double mub2mu0;		// prefactor for mech force
+  double mub2mu0hbinv;		// prefactor for mag force
+
+  double **cut_spin_long;	// cutoff distance long
+
+  double g_ewald;
+  int ewald_order;
+
+  int lattice_flag;			// flag for mech force computation
+  class FixNVESpin *lockfixnvespin;	// ptr for setups
+
+  void allocate();
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Incorrect args in pair_style command
+
+Self-explanatory.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair dipole/long requires atom attributes q, mu, torque
+
+The atom style defined does not have these attributes.
+
+E: Cannot (yet) use 'electron' units with dipoles
+
+This feature is not yet supported.
+
+E: Pair style requires a KSpace style
+
+No kspace style is defined.
+
+*/
diff --git a/src/SPIN/pair_spin_dipole_long.cpp b/src/SPIN/pair_spin_dipole_long.cpp
new file mode 100644
index 0000000000..e3575a6a07
--- /dev/null
+++ b/src/SPIN/pair_spin_dipole_long.cpp
@@ -0,0 +1,607 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   www.cs.sandia.gov/~sjplimp/lammps.html
+   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ------------------------------------------------------------------------
+   Contributing authors: Julien Tranchida (SNL)
+			 Stan Moore (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+
+#include "pair_spin_dipole_long.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "fix_nve_spin.h"
+#include "force.h"
+#include "kspace.h"
+#include "math_const.h"
+#include "memory.h"
+#include "modify.h"
+#include "error.h"
+#include "update.h"
+
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairSpinDipoleLong::PairSpinDipoleLong(LAMMPS *lmp) : PairSpin(lmp),
+lockfixnvespin(NULL)
+{
+  single_enable = 0;
+  ewaldflag = pppmflag = spinflag = 1;
+  respa_enable = 0;
+  no_virial_fdotr_compute = 1;
+  lattice_flag = 0;
+
+  hbar = force->hplanck/MY_2PI;			// eV/(rad.THz)
+  mub = 9.274e-4;                               // in A.Ang^2
+  mu_0 = 785.15;                                // in eV/Ang/A^2
+  mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);     // in eV.Ang^3
+  //mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);	// in eV
+  mub2mu0hbinv = mub2mu0 / hbar;		// in rad.THz
+
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairSpinDipoleLong::~PairSpinDipoleLong()
+{
+  if (allocated) {
+    memory->destroy(setflag);
+    memory->destroy(cut_spin_long);
+    memory->destroy(cutsq);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::settings(int narg, char **arg)
+{
+  if (narg < 1 || narg > 2)
+    error->all(FLERR,"Incorrect args in pair_style command");
+
+  if (strcmp(update->unit_style,"metal") != 0)
+    error->all(FLERR,"Spin simulations require metal unit style");
+
+  cut_spin_long_global = force->numeric(FLERR,arg[0]);
+  
+  // reset cutoffs that have been explicitly set
+  
+  if (allocated) {
+    int i,j;
+    for (i = 1; i <= atom->ntypes; i++) {
+      for (j = i+1; j <= atom->ntypes; j++) {
+        if (setflag[i][j]) {
+          cut_spin_long[i][j] = cut_spin_long_global;
+        }
+      }
+    }
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+  
+  if (narg != 3)
+    error->all(FLERR,"Incorrect args in pair_style command");
+  
+  int ilo,ihi,jlo,jhi;
+  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+
+  double spin_long_cut_one = force->numeric(FLERR,arg[2]);
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      setflag[i][j] = 1;
+      cut_spin_long[i][j] = spin_long_cut_one;
+      count++;
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::init_style()
+{
+  if (!atom->sp_flag)
+    error->all(FLERR,"Pair spin requires atom/spin style");
+  
+  // need a full neighbor list
+
+  int irequest = neighbor->request(this,instance_me);
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+  
+  // checking if nve/spin or neb/spin are a listed fix
+
+  int ifix = 0;
+  while (ifix < modify->nfix) {
+    if (strcmp(modify->fix[ifix]->style,"nve/spin") == 0) break;
+    if (strcmp(modify->fix[ifix]->style,"neb/spin") == 0) break;
+    ifix++;
+  }
+  if ((ifix == modify->nfix) && (comm->me == 0))
+    error->warning(FLERR,"Using pair/spin style without nve/spin or neb/spin");
+
+  // get the lattice_flag from nve/spin
+
+  for (int i = 0; i < modify->nfix; i++) {
+    if (strcmp(modify->fix[i]->style,"nve/spin") == 0) {
+      lockfixnvespin = (FixNVESpin *) modify->fix[i];
+      lattice_flag = lockfixnvespin->lattice_flag;
+    }
+  }
+
+  // insure use of KSpace long-range solver, set g_ewald
+
+  if (force->kspace == NULL)
+    error->all(FLERR,"Pair style requires a KSpace style");
+
+  g_ewald = force->kspace->g_ewald;
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+double PairSpinDipoleLong::init_one(int i, int j)
+{
+  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
+  
+  cut_spin_long[j][i] = cut_spin_long[i][j];
+  
+  return cut_spin_long_global;
+}
+
+/* ----------------------------------------------------------------------
+   extract the larger cutoff if "cut" or "cut_coul"
+------------------------------------------------------------------------- */
+
+void *PairSpinDipoleLong::extract(const char *str, int &dim)
+{
+  if (strcmp(str,"cut") == 0) {
+    dim = 0;
+    return (void *) &cut_spin_long_global;
+  } else if (strcmp(str,"cut_coul") == 0) {
+    dim = 0;
+    return (void *) &cut_spin_long_global;
+  } else if (strcmp(str,"ewald_order") == 0) {
+    ewald_order = 0;
+    ewald_order |= 1<<1;
+    ewald_order |= 1<<3;
+    dim = 0;
+    return (void *) &ewald_order;
+  } else if (strcmp(str,"ewald_mix") == 0) {
+    dim = 0;
+    return (void *) &mix_flag;
+  }
+  return NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::compute(int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;  
+  double r,rinv,r2inv,rsq;
+  double grij,expm2,t,erfc;
+  double evdwl,ecoul;
+  double bij[4];
+  double xi[3],rij[3],eij[3];
+  double spi[4],spj[4];
+  double fi[3],fmi[3];
+  double local_cut2;
+  double pre1,pre2,pre3;
+  int *ilist,*jlist,*numneigh,**firstneigh;  
+
+  evdwl = ecoul = 0.0;
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double **fm = atom->fm;
+  double **sp = atom->sp;	
+  int *type = atom->type;  
+  int nlocal = atom->nlocal;  
+  int newton_pair = force->newton_pair;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  pre1 = 2.0 * g_ewald / MY_PIS;
+  pre2 = 4.0 * pow(g_ewald,3.0) / MY_PIS;
+  pre3 = 8.0 * pow(g_ewald,5.0) / MY_PIS;
+
+  // computation of the exchange interaction
+  // loop over atoms and their neighbors
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    xi[0] = x[i][0];
+    xi[1] = x[i][1];
+    xi[2] = x[i][2];
+    jlist = firstneigh[i];
+    jnum = numneigh[i]; 
+    spi[0] = sp[i][0]; 
+    spi[1] = sp[i][1]; 
+    spi[2] = sp[i][2];
+    spi[3] = sp[i][3];
+    itype = type[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+
+      spj[0] = sp[j][0]; 
+      spj[1] = sp[j][1]; 
+      spj[2] = sp[j][2]; 
+      spj[3] = sp[j][3]; 
+
+      evdwl = 0.0;
+
+      fi[0] = fi[1] = fi[2] = 0.0;
+      fmi[0] = fmi[1] = fmi[2] = 0.0;
+      bij[0] = bij[1] = bij[2] = bij[3] = 0.0;
+     
+      rij[0] = x[j][0] - xi[0];
+      rij[1] = x[j][1] - xi[1];
+      rij[2] = x[j][2] - xi[2];
+      rsq = rij[0]*rij[0] + rij[1]*rij[1] + rij[2]*rij[2];
+      rinv = 1.0/sqrt(rsq);
+      eij[0] = rij[0]*rinv;
+      eij[1] = rij[1]*rinv;
+      eij[2] = rij[2]*rinv;
+
+      local_cut2 = cut_spin_long[itype][jtype]*cut_spin_long[itype][jtype];
+
+      if (rsq < local_cut2) {
+        r2inv = 1.0/rsq;
+
+        r = sqrt(rsq);
+        grij = g_ewald * r;
+        expm2 = exp(-grij*grij);
+        t = 1.0 / (1.0 + EWALD_P*grij);
+        erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+
+        bij[0] = erfc * rinv;
+        bij[1] = (bij[0] + pre1*expm2) * r2inv;
+        bij[2] = (3.0*bij[1] + pre2*expm2) * r2inv;
+        bij[3] = (5.0*bij[2] + pre3*expm2) * r2inv;
+
+	compute_long(i,j,eij,bij,fmi,spi,spj);
+	compute_long_mech(i,j,eij,bij,fmi,spi,spj);
+      }
+
+      // force accumulation
+
+      f[i][0] += fi[0];	 
+      f[i][1] += fi[1];	  	  
+      f[i][2] += fi[2];
+      fm[i][0] += fmi[0];	 
+      fm[i][1] += fmi[1];	  	  
+      fm[i][2] += fmi[2];
+
+      if (newton_pair || j < nlocal) {
+	f[j][0] -= fi[0];	 
+        f[j][1] -= fi[1];	  	  
+        f[j][2] -= fi[2];
+      }
+
+      if (eflag) {
+	if (rsq <= local_cut2) {
+	  evdwl -= spi[0]*fmi[0] + spi[1]*fmi[1] + 
+	    spi[2]*fmi[2];
+	  evdwl *= hbar;
+	}
+      } else evdwl = 0.0;
+
+
+      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+	  evdwl,ecoul,fi[0],fi[1],fi[2],rij[0],rij[1],rij[2]);
+
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   update the pair interaction fmi acting on the spin ii
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::compute_single_pair(int ii, double fmi[3])
+{
+  //int i,j,jj,jnum,itype,jtype;  
+  int j,jj,jnum,itype,jtype,ntypes; 
+  int k,locflag;
+  int *ilist,*jlist,*numneigh,**firstneigh;  
+  double r,rinv,r2inv,rsq,grij,expm2,t,erfc;
+  double local_cut2,pre1,pre2,pre3;
+  double bij[4],xi[3],rij[3],eij[3],spi[4],spj[4];
+
+  int *type = atom->type;  
+  double **x = atom->x;
+  double **sp = atom->sp;	
+  double **fm_long = atom->fm_long;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+  
+  // check if interaction applies to type of ii
+
+  itype = type[ii];
+  ntypes = atom->ntypes;
+  locflag = 0;
+  k = 1;
+  while (k <= ntypes) {
+    if (k <= itype) {
+      if (setflag[k][itype] == 1) {
+        locflag =1;
+        break;
+      }
+      k++;
+    } else if (k > itype) {
+      if (setflag[itype][k] == 1) {
+        locflag =1;
+        break;
+      }
+      k++;
+    } else error->all(FLERR,"Wrong type number");
+  }
+
+  // if interaction applies to type ii,
+  // locflag = 1 and compute pair interaction
+
+  if (locflag == 1) {
+
+    pre1 = 2.0 * g_ewald / MY_PIS;
+    pre2 = 4.0 * pow(g_ewald,3.0) / MY_PIS;
+    pre3 = 8.0 * pow(g_ewald,5.0) / MY_PIS;
+
+    // computation of the exchange interaction
+    // loop over neighbors of atom i
+      
+    //i = ilist[ii];
+    xi[0] = x[ii][0];
+    xi[1] = x[ii][1];
+    xi[2] = x[ii][2];
+    spi[0] = sp[ii][0]; 
+    spi[1] = sp[ii][1]; 
+    spi[2] = sp[ii][2];
+    spi[3] = sp[ii][3];
+    jlist = firstneigh[ii];
+    jnum = numneigh[ii]; 
+    //itype = type[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+
+      spj[0] = sp[j][0]; 
+      spj[1] = sp[j][1]; 
+      spj[2] = sp[j][2]; 
+      spj[3] = sp[j][3]; 
+
+      fmi[0] = fmi[1] = fmi[2] = 0.0;
+      bij[0] = bij[1] = bij[2] = bij[3] = 0.0;
+     
+      rij[0] = x[j][0] - xi[0];
+      rij[1] = x[j][1] - xi[1];
+      rij[2] = x[j][2] - xi[2];
+      rsq = rij[0]*rij[0] + rij[1]*rij[1] + rij[2]*rij[2];
+      rinv = 1.0/sqrt(rsq);
+      eij[0] = rij[0]*rinv;
+      eij[1] = rij[1]*rinv;
+      eij[2] = rij[2]*rinv;
+
+      local_cut2 = cut_spin_long[itype][jtype]*cut_spin_long[itype][jtype];
+
+      if (rsq < local_cut2) {
+        r2inv = 1.0/rsq;
+
+        r = sqrt(rsq);
+        grij = g_ewald * r;
+        expm2 = exp(-grij*grij);
+        t = 1.0 / (1.0 + EWALD_P*grij);
+        erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+
+        bij[0] = erfc * rinv;
+        bij[1] = (bij[0] + pre1*expm2) * r2inv;
+        bij[2] = (3.0*bij[1] + pre2*expm2) * r2inv;
+        bij[3] = (5.0*bij[2] + pre3*expm2) * r2inv;
+
+        compute_long(ii,j,eij,bij,fmi,spi,spj);
+      }
+    }
+
+    // adding the kspace components to fm
+    
+    fmi[0] += fm_long[ii][0];
+    fmi[1] += fm_long[ii][1];
+    fmi[2] += fm_long[ii][2];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute dipolar interaction between spins i and j
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::compute_long(int i, int j, double eij[3], 
+    double bij[4], double fmi[3], double spi[4], double spj[4])
+{
+  double sjeij,pre;
+  double b1,b2,gigj;
+
+  gigj = spi[3] * spj[3];
+  pre = gigj*mub2mu0hbinv;
+  sjeij = spj[0]*eij[0] + spj[1]*eij[1] + spj[2]*eij[2];
+
+  b1 = bij[1];
+  b2 = bij[2];
+
+  fmi[0] += pre * (b2 * sjeij * eij[0] - b1 * spj[0]);
+  fmi[1] += pre * (b2 * sjeij * eij[1] - b1 * spj[1]);
+  fmi[2] += pre * (b2 * sjeij * eij[2] - b1 * spj[2]);
+}
+
+/* ----------------------------------------------------------------------
+   compute the mechanical force due to the dipolar interaction between 
+   atom i and atom j
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::compute_long_mech(int i, int j, double eij[3],
+    double bij[4], double fi[3], double spi[3], double spj[3])
+{
+  double sisj,sieij,sjeij,b2,b3;
+  double g1,g2,g1b2_g2b3,gigj,pre;
+
+  gigj = spi[3] * spj[3];
+  pre = gigj*mub2mu0;
+  sisj = spi[0]*spj[0] + spi[1]*spj[1] + spi[2]*spj[2];
+  sieij = spi[0]*eij[0] + spi[1]*eij[1] + spi[2]*eij[2];
+  sjeij = spj[0]*eij[0] + spj[1]*eij[1] + spj[2]*eij[2];
+
+  b2 = bij[2];
+  b3 = bij[3];
+  g1 = sisj;
+  g2 = -sieij*sjeij;
+  g1b2_g2b3 = g1*b2 + g2*b3;
+
+  fi[0] += pre * (eij[0] * g1b2_g2b3 + b2 * (sjeij*spi[0] + sieij*spj[0]));
+  fi[1] += pre * (eij[1] * g1b2_g2b3 + b2 * (sjeij*spi[1] + sieij*spj[1]));
+  fi[2] += pre * (eij[2] * g1b2_g2b3 + b2 * (sjeij*spi[2] + sieij*spj[2]));
+}
+
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::allocate()
+{
+  allocated = 1;
+  int n = atom->ntypes;
+
+  memory->create(setflag,n+1,n+1,"pair:setflag");
+  for (int i = 1; i <= n; i++)
+    for (int j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  memory->create(cut_spin_long,n+1,n+1,"pair/spin/long:cut_spin_long");
+  memory->create(cutsq,n+1,n+1,"pair/spin/long:cutsq");
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::write_restart(FILE *fp)
+{
+  write_restart_settings(fp);
+
+  int i,j;
+  for (i = 1; i <= atom->ntypes; i++) {
+    for (j = i; j <= atom->ntypes; j++) {
+      fwrite(&setflag[i][j],sizeof(int),1,fp);
+      if (setflag[i][j]) {
+	fwrite(&cut_spin_long[i][j],sizeof(int),1,fp);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::read_restart(FILE *fp)
+{
+  read_restart_settings(fp);
+
+  allocate();
+
+  int i,j;
+  int me = comm->me;
+  for (i = 1; i <= atom->ntypes; i++) {
+    for (j = i; j <= atom->ntypes; j++) {
+      if (me == 0) fread(&setflag[i][j],sizeof(int),1,fp);
+      MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
+      if (setflag[i][j]) {
+	if (me == 0) {
+	  fread(&cut_spin_long[i][j],sizeof(int),1,fp);
+	}
+	MPI_Bcast(&cut_spin_long[i][j],1,MPI_INT,0,world);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::write_restart_settings(FILE *fp)
+{
+  fwrite(&cut_spin_long_global,sizeof(double),1,fp);
+  fwrite(&mix_flag,sizeof(int),1,fp);
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairSpinDipoleLong::read_restart_settings(FILE *fp)
+{
+  if (comm->me == 0) {
+    fread(&cut_spin_long_global,sizeof(double),1,fp);
+    fread(&mix_flag,sizeof(int),1,fp);
+  }
+  MPI_Bcast(&cut_spin_long_global,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
+}
diff --git a/src/SPIN/pair_spin_dipole_long.h b/src/SPIN/pair_spin_dipole_long.h
new file mode 100644
index 0000000000..6a05f56032
--- /dev/null
+++ b/src/SPIN/pair_spin_dipole_long.h
@@ -0,0 +1,100 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   www.cs.sandia.gov/~sjplimp/lammps.html
+   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(spin/dipole/long,PairSpinDipoleLong)
+
+#else
+
+#ifndef LMP_PAIR_SPIN_DIPOLE_LONG_H
+#define LMP_PAIR_SPIN_DIPOLE_LONG_H
+
+#include "pair_spin.h"
+
+namespace LAMMPS_NS {
+
+class PairSpinDipoleLong : public PairSpin {
+ public:
+  double cut_coul;
+  double **sigma;
+
+  PairSpinDipoleLong(class LAMMPS *);
+  ~PairSpinDipoleLong();
+  void settings(int, char **);
+  void coeff(int, char **);
+  double init_one(int, int);
+  void init_style();
+  void *extract(const char *, int &); 
+  
+  void compute(int, int);
+  void compute_single_pair(int, double *);
+
+  void compute_long(int, int, double *, double *, double *, 
+      double *, double *);
+  void compute_long_mech(int, int, double *, double *, double *, 
+      double *, double *);
+
+  void write_restart(FILE *);
+  void read_restart(FILE *);
+  void write_restart_settings(FILE *);
+  void read_restart_settings(FILE *);
+  
+  double cut_spin_long_global;	// global long cutoff distance 
+
+ protected:
+  double hbar;	 		// reduced Planck's constant
+  double mub;			// Bohr's magneton
+  double mu_0;			// vacuum permeability
+  double mub2mu0;		// prefactor for mech force
+  double mub2mu0hbinv;		// prefactor for mag force
+
+  double **cut_spin_long;	// cutoff distance long
+
+  double g_ewald;
+  int ewald_order;
+
+  int lattice_flag;			// flag for mech force computation
+  class FixNVESpin *lockfixnvespin;	// ptr for setups
+
+  void allocate();
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Incorrect args in pair_style command
+
+Self-explanatory.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair dipole/long requires atom attributes q, mu, torque
+
+The atom style defined does not have these attributes.
+
+E: Can only use 'metal' units with spins
+
+This feature is not yet supported.
+
+E: Pair style requires a KSpace style
+
+No kspace style is defined.
+
+*/
diff --git a/src/SPIN/pair_spin_exchange.cpp b/src/SPIN/pair_spin_exchange.cpp
index 0260a611cf..93b6d9501e 100644
--- a/src/SPIN/pair_spin_exchange.cpp
+++ b/src/SPIN/pair_spin_exchange.cpp
@@ -75,7 +75,7 @@ PairSpinExchange::~PairSpinExchange()
 
 void PairSpinExchange::settings(int narg, char **arg)
 {
-  if (narg < 1 || narg > 2)
+  if (narg < 1 || narg > 7) 
     error->all(FLERR,"Incorrect number of args in pair_style pair/spin command");
 
   if (strcmp(update->unit_style,"metal") != 0)
@@ -464,8 +464,6 @@ void PairSpinExchange::allocate()
   memory->create(cutsq,n+1,n+1,"pair:cutsq");
 }
 
-
-
 /* ----------------------------------------------------------------------
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
@@ -550,5 +548,3 @@ void PairSpinExchange::read_restart_settings(FILE *fp)
   MPI_Bcast(&offset_flag,1,MPI_INT,0,world);
   MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
 }
-
-
diff --git a/src/atom.cpp b/src/atom.cpp
index dfea9348c9..a53f35d7b3 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -100,7 +100,7 @@ Atom::Atom(LAMMPS *lmp) : Pointers(lmp)
 
   // SPIN package
 
-  sp = fm = NULL;
+  sp = fm = fm_long = NULL;
 
   // USER-DPD
 
@@ -279,6 +279,7 @@ Atom::~Atom()
 
   memory->destroy(sp);
   memory->destroy(fm);
+  memory->destroy(fm_long);
 
   memory->destroy(vfrac);
   memory->destroy(s0);
diff --git a/src/atom.h b/src/atom.h
index e7c4137959..5cd9ca819b 100644
--- a/src/atom.h
+++ b/src/atom.h
@@ -69,6 +69,7 @@ class Atom : protected Pointers {
 
   double **sp;
   double **fm;
+  double **fm_long;
 
   // PERI package
 
diff --git a/src/kspace.cpp b/src/kspace.cpp
index 64a769fa51..643cf22113 100644
--- a/src/kspace.cpp
+++ b/src/kspace.cpp
@@ -37,7 +37,7 @@ KSpace::KSpace(LAMMPS *lmp) : Pointers(lmp)
   virial[0] = virial[1] = virial[2] = virial[3] = virial[4] = virial[5] = 0.0;
 
   triclinic_support = 1;
-  ewaldflag = pppmflag = msmflag = dispersionflag = tip4pflag = dipoleflag = 0;
+  ewaldflag = pppmflag = msmflag = dispersionflag = tip4pflag = dipoleflag = spinflag = 0;
   compute_flag = 1;
   group_group_enable = 0;
   stagger_flag = 0;
@@ -190,6 +190,8 @@ void KSpace::pair_check()
     error->all(FLERR,"KSpace style is incompatible with Pair style");
   if (dipoleflag && !force->pair->dipoleflag)
     error->all(FLERR,"KSpace style is incompatible with Pair style");
+  if (spinflag && !force->pair->spinflag)
+    error->all(FLERR,"KSpace style is incompatible with Pair style");
   if (tip4pflag && !force->pair->tip4pflag)
     error->all(FLERR,"KSpace style is incompatible with Pair style");
 
@@ -266,7 +268,7 @@ void KSpace::ev_setup(int eflag, int vflag, int alloc)
    called initially, when particle count changes, when charges are changed
 ------------------------------------------------------------------------- */
 
-void KSpace::qsum_qsq()
+void KSpace::qsum_qsq(int warning_flag)
 {
   const double * const q = atom->q;
   const int nlocal = atom->nlocal;
@@ -283,7 +285,7 @@ void KSpace::qsum_qsq()
   MPI_Allreduce(&qsum_local,&qsum,1,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(&qsqsum_local,&qsqsum,1,MPI_DOUBLE,MPI_SUM,world);
 
-  if ((qsqsum == 0.0) && (comm->me == 0) && warn_nocharge) {
+  if ((qsqsum == 0.0) && (comm->me == 0) && warn_nocharge && warning_flag) {
     error->warning(FLERR,"Using kspace solver on system with no charge");
     warn_nocharge = 0;
   }
diff --git a/src/kspace.h b/src/kspace.h
index 4fd260e186..69f4ccb433 100644
--- a/src/kspace.h
+++ b/src/kspace.h
@@ -44,6 +44,7 @@ class KSpace : protected Pointers {
   int dispersionflag;            // 1 if a LJ/dispersion solver
   int tip4pflag;                 // 1 if a TIP4P solver
   int dipoleflag;                // 1 if a dipole solver
+  int spinflag;			 // 1 if a spin solver
   int differentiation_flag;
   int neighrequest_flag;         // used to avoid obsolete construction
                                  // of neighbor lists
@@ -108,7 +109,7 @@ class KSpace : protected Pointers {
 
   // public so can be called by commands that change charge
 
-  void qsum_qsq();
+  void qsum_qsq(int warning_flag = 1);
 
   // general child-class methods
 
diff --git a/src/pair.cpp b/src/pair.cpp
index 20bc35fb01..92b5d003a8 100644
--- a/src/pair.cpp
+++ b/src/pair.cpp
@@ -75,7 +75,7 @@ Pair::Pair(LAMMPS *lmp) : Pointers(lmp)
   setflag = NULL;
   cutsq = NULL;
 
-  ewaldflag = pppmflag = msmflag = dispersionflag = tip4pflag = dipoleflag = 0;
+  ewaldflag = pppmflag = msmflag = dispersionflag = tip4pflag = dipoleflag = spinflag = 0;
   reinitflag = 1;
 
   // pair_modify settings
diff --git a/src/pair.h b/src/pair.h
index d766a20ef4..7481514dae 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -61,6 +61,7 @@ class Pair : protected Pointers {
   int dispersionflag;            // 1 if compatible with LJ/dispersion solver
   int tip4pflag;                 // 1 if compatible with TIP4P solver
   int dipoleflag;                // 1 if compatible with dipole solver
+  int spinflag;			 // 1 if compatible with spin solver
   int reinitflag;                // 1 if compatible with fix adapt and alike
 
   int tail_flag;                 // pair_modify flag for LJ tail correction
diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp
index a1a6c81195..4dcd249ac6 100644
--- a/src/pair_hybrid.cpp
+++ b/src/pair_hybrid.cpp
@@ -358,6 +358,7 @@ void PairHybrid::flags()
     if (styles[m]->pppmflag) pppmflag = 1;
     if (styles[m]->msmflag) msmflag = 1;
     if (styles[m]->dipoleflag) dipoleflag = 1;
+    if (styles[m]->spinflag) spinflag = 1;
     if (styles[m]->dispersionflag) dispersionflag = 1;
     if (styles[m]->tip4pflag) tip4pflag = 1;
     if (styles[m]->compute_flag) compute_flag = 1;