diff --git a/doc/html/Section_howto.html b/doc/html/Section_howto.html
index 35b5419bc5..c168a5bd84 100644
--- a/doc/html/Section_howto.html
+++ b/doc/html/Section_howto.html
@@ -1086,7 +1086,7 @@ the sheared fluid and integrate the SLLOD equations of motion for the
 system.  Fix nvt/sllod uses <a class="reference internal" href="compute_temp_deform.html"><span class="doc">compute temp/deform</span></a> to compute a thermal temperature
 by subtracting out the streaming velocity of the shearing atoms.  The
 velocity profile or other properties of the fluid can be monitored via
-the <a class="reference internal" href="fix_ave_chunk.html"><span class="doc">fix ave/chunk</span></a> command.</p>
+the <span class="xref doc">fix ave/spatial</span> command.</p>
 <p>As discussed in the previous section on non-orthogonal simulation
 boxes, the amount of tilt or skew that can be applied is limited by
 LAMMPS for computational efficiency to be 1/2 of the parallel box
@@ -2011,13 +2011,13 @@ on each of two regions to add/subtract specified amounts of energy to
 both regions.  In both cases, the resulting temperatures of the two
 regions can be monitored with the &#8220;compute temp/region&#8221; command and
 the temperature profile of the intermediate region can be monitored
-with the <a class="reference internal" href="fix_ave_chunk.html"><span class="doc">fix ave/chunk</span></a> and <a class="reference internal" href="compute_ke_atom.html"><span class="doc">compute ke/atom</span></a> commands.</p>
+with the <span class="xref doc">fix ave/spatial</span> and <a class="reference internal" href="compute_ke_atom.html"><span class="doc">compute ke/atom</span></a> commands.</p>
 <p>The third method is to perform a reverse non-equilibrium MD simulation
 using the <a class="reference internal" href="fix_thermal_conductivity.html"><span class="doc">fix thermal/conductivity</span></a>
 command which implements the rNEMD algorithm of Muller-Plathe.
 Kinetic energy is swapped between atoms in two different layers of the
 simulation box.  This induces a temperature gradient between the two
-layers which can be monitored with the <a class="reference internal" href="fix_ave_chunk.html"><span class="doc">fix ave/chunk</span></a> and <a class="reference internal" href="compute_ke_atom.html"><span class="doc">compute ke/atom</span></a> commands.  The fix tallies the
+layers which can be monitored with the <span class="xref doc">fix ave/spatial</span> and <a class="reference internal" href="compute_ke_atom.html"><span class="doc">compute ke/atom</span></a> commands.  The fix tallies the
 cumulative energy transfer that it performs.  See the <a class="reference internal" href="fix_thermal_conductivity.html"><span class="doc">fix thermal/conductivity</span></a> command for
 details.</p>
 <p>The fourth method is based on the Green-Kubo (GK) formula which
@@ -2060,7 +2060,7 @@ used to shear the fluid in between them, again with some kind of
 thermostat that modifies only the thermal (non-shearing) components of
 velocity to prevent the fluid from heating up.</p>
 <p>In both cases, the velocity profile setup in the fluid by this
-procedure can be monitored by the <a class="reference internal" href="fix_ave_chunk.html"><span class="doc">fix ave/chunk</span></a> command, which determines
+procedure can be monitored by the <span class="xref doc">fix ave/spatial</span> command, which determines
 grad(Vstream) in the equation above.  E.g. the derivative in the
 y-direction of the Vx component of fluid motion or grad(Vstream) =
 dVx/dy.  The Pxy off-diagonal component of the pressure or stress
@@ -2073,7 +2073,7 @@ using the <a class="reference internal" href="fix_viscosity.html"><span class="d
 the rNEMD algorithm of Muller-Plathe.  Momentum in one dimension is
 swapped between atoms in two different layers of the simulation box in
 a different dimension.  This induces a velocity gradient which can be
-monitored with the <a class="reference internal" href="fix_ave_chunk.html"><span class="doc">fix ave/chunk</span></a> command.
+monitored with the <span class="xref doc">fix ave/spatial</span> command.
 The fix tallies the cummulative momentum transfer that it performs.
 See the <a class="reference internal" href="fix_viscosity.html"><span class="doc">fix viscosity</span></a> command for details.</p>
 <p>The fourth method is based on the Green-Kubo (GK) formula which
diff --git a/doc/html/Section_intro.html b/doc/html/Section_intro.html
index 3f41a6b156..7c6a5c2f0a 100644
--- a/doc/html/Section_intro.html
+++ b/doc/html/Section_intro.html
@@ -391,7 +391,7 @@ molecular dynamics options:</p>
 <li>coupled rigid body integration via the <a class="reference internal" href="fix_poems.html"><span class="doc">POEMS</span></a> library</li>
 <li><a class="reference internal" href="fix_qmmm.html"><span class="doc">QM/MM coupling</span></a></li>
 <li><a class="reference internal" href="fix_ipi.html"><span class="doc">path-integral molecular dynamics (PIMD)</span></a> and <a class="reference internal" href="fix_pimd.html"><span class="doc">this as well</span></a></li>
-<li>Monte Carlo via <a class="reference internal" href="fix_gcmc.html"><span class="doc">GCMC</span></a> and <a class="reference internal" href="fix_tfmc.html"><span class="doc">tfMC</span></a> <a class="reference internal" href="fix_atom_swap.html"><span class="doc">atom swapping</span></a> and <a class="reference internal" href="fix_bond_swap.html"><span class="doc">bond swapping</span></a></li>
+<li>Monte Carlo via <a class="reference internal" href="fix_gcmc.html"><span class="doc">GCMC</span></a> and <a class="reference internal" href="fix_tfmc.html"><span class="doc">tfMC</span></a> and <span class="xref doc">atom swapping</span></li>
 <li><a class="reference internal" href="pair_dsmc.html"><span class="doc">Direct Simulation Monte Carlo</span></a> for low-density fluids</li>
 <li><a class="reference internal" href="pair_peri.html"><span class="doc">Peridynamics mesoscale modeling</span></a></li>
 <li><a class="reference internal" href="fix_lb_fluid.html"><span class="doc">Lattice Boltzmann fluid</span></a></li>
diff --git a/doc/html/Section_packages.html b/doc/html/Section_packages.html
index f882ee234b..4522221d6c 100644
--- a/doc/html/Section_packages.html
+++ b/doc/html/Section_packages.html
@@ -1224,7 +1224,7 @@ styles which implement different materials models.</p>
 <p>Supporting info:
 <a class="reference external" href="PDF/PDLammps_overview.pdf">doc/PDF/PDLammps_overview.pdf</a>,
 <a class="reference external" href="PDF/PDLammps_EPS.pdf">doc/PDF/PDLammps_EPS.pdf</a>,
-<a class="reference external" href="PDF/PDLammps_VES.pdf">doc/PDF/PDLammps_VES.pdf</a>, <a class="reference internal" href="atom_style.html"><span class="doc">atom_style peri</span></a>, <a class="reference internal" href="compute_damage_atom.html"><span class="doc">compute damage/atom</span></a>,
+<a class="reference external" href="PDF/PDLammps_VES.pdf">doc/PDF/PDLammps_VES.pdf</a>, <a class="reference internal" href="atom_style.html"><span class="doc">atom_style peri</span></a>, <span class="xref doc">compute damage</span>,
 <a class="reference internal" href="pair_peri.html"><span class="doc">pair_style peri/pmb</span></a>, examples/peri</p>
 <hr class="docutils" />
 </div>
@@ -1272,8 +1272,9 @@ one step.  Type &#8220;python src/Make.py -h -poems&#8221; to see the details.</
 <span id="python"></span><h3>4.1.22. PYTHON package</h3>
 <p>Contents: A <a class="reference internal" href="python.html"><span class="doc">python</span></a> command which allow you to execute
 Python code from a LAMMPS input script.  The code can be in a separate
-file or embedded in the input script itself.  See <a class="reference external" href="Section_python.html&quot;">Section python 11.2</a> for an overview of using Python from
-LAMMPS and for other ways to use LAMMPS and Python together.</p>
+file or embedded in the input script itself.  See <span class="xref std std-ref">Section python 11.2</span> for an overview of using Python from
+LAMMPS and <a class="reference external" href="Section_python.html&quot;">Section python</a> for other ways to use
+LAMMPS and Python together.</p>
 <p>Building with the PYTHON package assumes you have a Python shared
 library available on your system, which needs to be a Python 2
 version, 2.6 or later.  Python 3 is not supported.  The build uses the
@@ -1426,7 +1427,7 @@ few large bodies or many small bodies.</p>
 </pre></div>
 </div>
 <p>Supporting info: <a class="reference internal" href="compute_erotate_rigid.html"><span class="doc">compute erotate/rigid</span></a>,
-<a class="reference internal" href="fix_shake.html"><span class="doc">fix shake</span></a>, <a class="reference internal" href="fix_shake.html"><span class="doc">fix rattle</span></a>, <a class="reference internal" href="fix_rigid.html"><span class="doc">fix rigid/*</span></a>, examples/ASPHERE, examples/rigid</p>
+<a class="reference internal" href="fix_shake.html"><span class="doc">fix shake</span></a>, <span class="xref doc">fix rattle</span>, <a class="reference internal" href="fix_rigid.html"><span class="doc">fix rigid/*</span></a>, examples/ASPHERE, examples/rigid</p>
 <hr class="docutils" />
 </div>
 <div class="section" id="shock-package">
@@ -1474,8 +1475,8 @@ properties of the potential are also included.</p>
 <div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">Make</span><span class="o">.</span><span class="n">py</span> <span class="o">-</span><span class="n">p</span> <span class="o">^</span><span class="n">snap</span> <span class="o">-</span><span class="n">a</span> <span class="n">machine</span>
 </pre></div>
 </div>
-<p>Supporting info: <a class="reference internal" href="pair_snap.html"><span class="doc">pair snap</span></a>, <a class="reference internal" href="compute_sna_atom.html"><span class="doc">compute sna/atom</span></a>, <a class="reference internal" href="compute_sna_atom.html"><span class="doc">compute snad/atom</span></a>,
-<a class="reference internal" href="compute_sna_atom.html"><span class="doc">compute snav/atom</span></a>, examples/snap</p>
+<p>Supporting info: <a class="reference internal" href="pair_snap.html"><span class="doc">pair snap</span></a>, <a class="reference internal" href="compute_sna_atom.html"><span class="doc">compute sna/atom</span></a>, <span class="xref doc">compute snad/atom</span>,
+<span class="xref doc">compute snav/atom</span>, examples/snap</p>
 <hr class="docutils" />
 </div>
 <div class="section" id="srd-package">
@@ -1899,7 +1900,7 @@ src/Make.py -h -voronoi&#8221; to see the details.</p>
 <tr class="row-odd"><td><a class="reference internal" href="#user-tally"><span class="std std-ref">USER-TALLY</span></a></td>
 <td>Pairwise tallied computes</td>
 <td>Axel Kohlmeyer (Temple U)</td>
-<td><a class="reference internal" href="compute_tally.html"><span class="doc">compute XXX/tally</span></a></td>
+<td><span class="xref doc">compute</span></td>
 <td>USER/tally</td>
 <td><ul class="first last simple">
 <li></li>
@@ -1913,7 +1914,7 @@ src/Make.py -h -voronoi&#8221; to see the details.</p>
 <tr class="row-even"><td><a class="reference internal" href="#user-vtk"><span class="std std-ref">USER-VTK</span></a></td>
 <td>VTK-style dumps</td>
 <td>Berger and Queteschiner (6)</td>
-<td><a class="reference internal" href="dump_custom_vtk.html"><span class="doc">compute custom/vtk</span></a></td>
+<td><span class="xref doc">compute custom/vtk</span></td>
 <td><ul class="first last simple">
 <li></li>
 </ul>
@@ -2003,7 +2004,7 @@ Dynamics.  This package implements an atom, pair, and fix style which
 allows electrons to be treated as explicit particles in an MD
 calculation.  See src/USER-AWPMD/README for more details.</p>
 <p>To build LAMMPS with this package ...</p>
-<p>Supporting info: src/USER-AWPMD/README, <a class="reference internal" href="pair_awpmd.html"><span class="doc">fix awpmd/cut</span></a>, examples/USER/awpmd</p>
+<p>Supporting info: src/USER-AWPMD/README, <span class="xref doc">fix awpmd/cut</span>, examples/USER/awpmd</p>
 <p>Author: Ilya Valuev at the JIHT in Russia (valuev at
 physik.hu-berlin.de).  Contact him directly if you have questions.</p>
 <hr class="docutils" />
@@ -2046,7 +2047,7 @@ have questions.</p>
 calculating x-ray and electron diffraction intensities based on
 kinematic diffraction theory.  See src/USER-DIFFRACTION/README for
 more details.</p>
-<p>Supporting info: <a class="reference internal" href="compute_saed.html"><span class="doc">compute saed</span></a>, <a class="reference internal" href="compute_xrd.html"><span class="doc">compute xrd</span></a>, <a class="reference internal" href="fix_saed_vtk.html"><span class="doc">fix saed/vtk</span></a>,
+<p>Supporting info: <a class="reference internal" href="compute_saed.html"><span class="doc">compute saed</span></a>, <a class="reference internal" href="compute_xrd.html"><span class="doc">compute xrd</span></a>, <a class="reference internal" href="fix_saed_vtk.html"><span class="doc">fix saed.vtk</span></a>,
 examples/USER/diffraction</p>
 <p>Author: Shawn P. Coleman (shawn.p.coleman8.ctr at mail.mil) while at
 the University of Arkansas.  Contact him directly if you have
@@ -2064,13 +2065,11 @@ equations of motion are integrated efficiently through the Shardlow
 splitting algorithm.  See src/USER-DPD/README for more details.</p>
 <p>Supporting info: /src/USER-DPD/README, <a class="reference internal" href="compute_dpd.html"><span class="doc">compute dpd</span></a>
 <a class="reference internal" href="compute_dpd_atom.html"><span class="doc">compute dpd/atom</span></a>
-<a class="reference internal" href="fix_eos_table.html"><span class="doc">fix eos/cv</span></a> <a class="reference internal" href="fix_eos_table.html"><span class="doc">fix eos/table</span></a></p>
-<blockquote>
-<div><a class="reference internal" href="fix_eos_table_rx.html"><span class="doc">fix eos/table/rx</span></a> <a class="reference internal" href="fix_shardlow.html"><span class="doc">fix shardlow</span></a></div></blockquote>
-<p><a class="reference internal" href="fix_rx.html"><span class="doc">fix rx</span></a> <a class="reference internal" href="pair_table_rx.html"><span class="doc">pair table/rx</span></a>
-<a class="reference internal" href="pair_dpd_fdt.html"><span class="doc">pair dpd/fdt</span></a> <a class="reference internal" href="pair_dpd_fdt.html"><span class="doc">pair dpd/fdt/energy</span></a>
-<a class="reference internal" href="pair_exp6_rx.html"><span class="doc">pair exp6/rx</span></a> <a class="reference internal" href="pair_multi_lucy.html"><span class="doc">pair multi/lucy</span></a>
-<a class="reference internal" href="pair_multi_lucy_rx.html"><span class="doc">pair multi/lucy/rx</span></a>, examples/USER/dpd</p>
+<a class="reference internal" href="fix_eos_table.html"><span class="doc">fix eos/cv</span></a> <a class="reference internal" href="fix_eos_table.html"><span class="doc">fix eos/table</span></a>
+<a class="reference internal" href="fix_shardlow.html"><span class="doc">fix shardlow</span></a>
+<span class="xref doc">pair_dpd/conservative</span>
+<a class="reference internal" href="pair_dpd_fdt.html"><span class="doc">pair_dpd/fdt</span></a>
+<a class="reference internal" href="pair_dpd_fdt.html"><span class="doc">pair_dpd/fdt/energy</span></a>, examples/USER/dpd</p>
 <p>Authors: James Larentzos (ARL) (james.p.larentzos.civ at mail.mil),
 Timothy Mattox (Engility Corp) (Timothy.Mattox at engilitycorp.com)
 and John Brennan (ARL) (john.k.brennan.civ at mail.mil).  Contact them
@@ -2145,7 +2144,7 @@ this package.  Also see src/USER-INTEL/README for more details. See
 the KOKKOS, OPT, and USER-OMP packages, which also have CPU and
 Phi-enabled styles.</p>
 <p>Supporting info: examples/accelerate, src/USER-INTEL/TEST</p>
-<p><a class="reference internal" href="Section_accelerate.html#acc-3"><span class="std std-ref">Section_accelerate</span></a></p>
+<p><span class="xref std std-ref">Section_accelerate</span></p>
 <p>Author: Mike Brown at Intel (michael.w.brown at intel.com).  Contact
 him directly if you have questions.</p>
 <p>For the USER-INTEL package, you have 2 choices when building.  You can
@@ -2262,7 +2261,7 @@ to VMD, support for new file formats can be added to LAMMPS (or VMD or
 other programs that use them) without having to recompile the
 application itself.</p>
 <p>See this doc page to get started:</p>
-<p><a class="reference internal" href="dump_molfile.html"><span class="doc">dump molfile</span></a></p>
+<p><span class="xref std std-ref">dump molfile</span></p>
 <p>The person who created this package is Axel Kohlmeyer at Temple U
 (akohlmey at gmail.com).  Contact him directly if you have questions.</p>
 <hr class="docutils" />
@@ -2274,7 +2273,7 @@ application itself.</p>
 other optimizations of various LAMMPS pair styles, dihedral
 styles, and fix styles.</p>
 <p>See this section of the manual to get started:</p>
-<p><a class="reference internal" href="Section_accelerate.html#acc-3"><span class="std std-ref">Section_accelerate</span></a></p>
+<p><span class="xref std std-ref">Section_accelerate</span></p>
 <p>The person who created this package is Axel Kohlmeyer at Temple U
 (akohlmey at gmail.com).  Contact him directly if you have questions.</p>
 <p>For the USER-OMP package, your Makefile.machine needs additional
diff --git a/doc/html/Section_start.html b/doc/html/Section_start.html
index 4d422cdc2a..b31c4631b5 100644
--- a/doc/html/Section_start.html
+++ b/doc/html/Section_start.html
@@ -1365,7 +1365,7 @@ supercomputer there may be dozens or 1000s of physical nodes.</p>
 Note that the keywords do not use a leading minus sign.  I.e. the
 keyword is &#8220;t&#8221;, not &#8220;-t&#8221;.  Also note that each of the keywords has a
 default setting.  Example of when to use these options and what
-settings to use on different platforms is given in <a class="reference internal" href="Section_accelerate.html#acc-3"><span class="std std-ref">Section 5.8</span></a>.</p>
+settings to use on different platforms is given in <span class="xref std std-ref">Section 5.8</span>.</p>
 <ul class="simple">
 <li>d or device</li>
 <li>g or gpus</li>
diff --git a/doc/html/accelerate_kokkos.html b/doc/html/accelerate_kokkos.html
index e517e0e2c5..d48007a2ca 100644
--- a/doc/html/accelerate_kokkos.html
+++ b/doc/html/accelerate_kokkos.html
@@ -351,7 +351,7 @@ used if running with KOKKOS_DEVICES=Pthreads for pthreads.  It is not
 necessary for KOKKOS_DEVICES=OpenMP for OpenMP, because OpenMP
 provides alternative methods via environment variables for binding
 threads to hardware cores.  More info on binding threads to cores is
-given in <a class="reference internal" href="Section_accelerate.html#acc-3"><span class="std std-ref">this section</span></a>.</p>
+given in <span class="xref std std-ref">this section</span>.</p>
 <p>KOKKOS_ARCH=KNC enables compiler switches needed when compling for an
 Intel Phi processor.</p>
 <p>KOKKOS_USE_TPLS=librt enables use of a more accurate timer mechanism
diff --git a/doc/html/body.html b/doc/html/body.html
index 5b3e3d10df..6a2be8aff8 100644
--- a/doc/html/body.html
+++ b/doc/html/body.html
@@ -302,6 +302,11 @@ sphere is determined by the <em>bflag1</em> parameter for the <em>body</em> keyw
 The <em>bflag2</em> argument is ignored.</p>
 <hr class="docutils" />
 <p><strong>Specifics of body style rounded/polygon:</strong></p>
+<div class="admonition note">
+<p class="first admonition-title">Note</p>
+<p class="last">Aug 2016 - This body style has not yet been added to LAMMPS.
+The info below is a placeholder.</p>
+</div>
 <p>The <em>rounded/polygon</em> body style represents body particles as a convex
 polygon with a variable number N &gt; 2 of vertices, which can only be
 used for 2d models.  One example use of this body style is for 2d
@@ -359,8 +364,9 @@ particles whose edge length is sqrt(2):</p>
 <span class="mf">1.0</span>
 </pre></div>
 </div>
-<p>The <span class="xref doc">pair_style body/rounded/polygon</span> command
-can be used with this body style to compute body/body interactions.</p>
+<p>The <span class="xref doc">pair_style body/rounded/polygon</span>
+command can be used with this body style to compute body/body
+interactions.</p>
 <p>For output purposes via the <a class="reference internal" href="compute_body_local.html"><span class="doc">compute body/local</span></a> and <a class="reference internal" href="dump.html"><span class="doc">dump local</span></a>
 commands, this body style produces one datum for each of the N
 sub-particles in a body particle.  The datum has 3 values:</p>
diff --git a/doc/html/compute_damage_atom.html b/doc/html/compute_damage_atom.html
index 97a0eba0d0..bc378e49e8 100644
--- a/doc/html/compute_damage_atom.html
+++ b/doc/html/compute_damage_atom.html
@@ -170,8 +170,7 @@ LAMMPS was built with that package.  See the <a class="reference internal" href=
 </div>
 <div class="section" id="related-commands">
 <h2>Related commands</h2>
-<p><a class="reference internal" href="compute_dilatation_atom.html"><span class="doc">compute dilatation/atom</span></a>,
-<a class="reference internal" href="compute_plasticity_atom.html"><span class="doc">compute plasticity/atom</span></a></p>
+<p><span class="xref doc">compute dilatation</span>, <span class="xref doc">compute plasticity</span></p>
 <p><strong>Default:</strong> none</p>
 </div>
 </div>
diff --git a/doc/html/compute_dilatation_atom.html b/doc/html/compute_dilatation_atom.html
index 798663b6a8..cf1947f3b2 100644
--- a/doc/html/compute_dilatation_atom.html
+++ b/doc/html/compute_dilatation_atom.html
@@ -172,8 +172,7 @@ LAMMPS was built with that package.  See the <a class="reference internal" href=
 </div>
 <div class="section" id="related-commands">
 <h2>Related commands</h2>
-<p><a class="reference internal" href="compute_damage_atom.html"><span class="doc">compute damage/atom</span></a>,
-<a class="reference internal" href="compute_plasticity_atom.html"><span class="doc">compute plasticity/atom</span></a></p>
+<p><span class="xref doc">compute damage</span>, <span class="xref doc">compute plasticity</span></p>
 <p><strong>Default:</strong> none</p>
 </div>
 </div>
diff --git a/doc/html/compute_erotate_rigid.html b/doc/html/compute_erotate_rigid.html
index 501319d8c8..0e46c9e7fe 100644
--- a/doc/html/compute_erotate_rigid.html
+++ b/doc/html/compute_erotate_rigid.html
@@ -172,7 +172,7 @@ LAMMPS was built with that package.  See the <a class="reference internal" href=
 </div>
 <div class="section" id="related-commands">
 <h2>Related commands</h2>
-<p><a class="reference internal" href="compute_ke_rigid.html"><span class="doc">compute ke/rigid</span></a></p>
+<p><span class="xref doc">compute ke/rigid</span></p>
 <p><strong>Default:</strong> none</p>
 </div>
 </div>
diff --git a/doc/html/compute_plasticity_atom.html b/doc/html/compute_plasticity_atom.html
index 54bffecbd2..11518b0c64 100644
--- a/doc/html/compute_plasticity_atom.html
+++ b/doc/html/compute_plasticity_atom.html
@@ -168,8 +168,7 @@ LAMMPS was built with that package.  See the <a class="reference internal" href=
 </div>
 <div class="section" id="related-commands">
 <h2>Related commands</h2>
-<p><a class="reference internal" href="compute_damage_atom.html"><span class="doc">compute damage/atom</span></a>,
-<a class="reference internal" href="compute_dilatation_atom.html"><span class="doc">compute dilatation/atom</span></a></p>
+<p><span class="xref doc">compute damage</span>, <span class="xref doc">compute dilatation</span></p>
 <p><strong>Default:</strong> none</p>
 <hr class="docutils" />
 <p id="mitchell"><strong>(Mitchell)</strong> Mitchell, &#8220;A non-local, ordinary-state-based
diff --git a/doc/html/compute_reduce.html b/doc/html/compute_reduce.html
index 519bc9ff3c..292fa820cb 100644
--- a/doc/html/compute_reduce.html
+++ b/doc/html/compute_reduce.html
@@ -220,7 +220,7 @@ asterisk means all indices from n to N (inclusive).  A middle asterisk
 means all indices from m to n (inclusive).</p>
 <p>Using a wildcard is the same as if the individual columns of the array
 had been listed one by one.  E.g. these 2 compute reduce commands are
-equivalent, since the <a class="reference internal" href="compute_stress_atom.html"><span class="doc">compute stress/atom</span></a>
+equivalent, since the <span class="xref doc">compute stress/atom</span>
 command creates a per-atom array with 6 columns:</p>
 <div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">compute</span> <span class="n">myPress</span> <span class="nb">all</span> <span class="n">stress</span><span class="o">/</span><span class="n">atom</span> <span class="n">NULL</span>
 <span class="n">compute</span> <span class="mi">2</span> <span class="nb">all</span> <span class="n">reduce</span> <span class="nb">min</span> <span class="n">myPress</span><span class="p">[</span><span class="o">*</span><span class="p">]</span>
diff --git a/doc/html/compute_temp_deform_eff.html b/doc/html/compute_temp_deform_eff.html
index d83f6011d6..748ee79b41 100644
--- a/doc/html/compute_temp_deform_eff.html
+++ b/doc/html/compute_temp_deform_eff.html
@@ -149,11 +149,11 @@ nuclei and electrons in the <a class="reference internal" href="pair_eff.html"><
 model, after subtracting out a streaming velocity induced by the
 simulation box changing size and/or shape, for example in a
 non-equilibrium MD (NEMD) simulation.  The size/shape change is
-induced by use of the <a class="reference internal" href="fix_deform.html"><span class="doc">fix deform</span></a> command.  A
+induced by use of the <span class="xref doc">fix deform/eff</span> command.  A
 compute of this style is created by the <a class="reference internal" href="fix_nvt_sllod_eff.html"><span class="doc">fix nvt/sllod/eff</span></a> command to compute the thermal
 temperature of atoms for thermostatting purposes.  A compute of this
 style can also be used by any command that computes a temperature,
-e.g. <a class="reference internal" href="thermo_modify.html"><span class="doc">thermo_modify</span></a>, <a class="reference internal" href="fix_nh_eff.html"><span class="doc">fix npt/eff</span></a>,
+e.g. <a class="reference internal" href="thermo_modify.html"><span class="doc">thermo_modify</span></a>, <a class="reference internal" href="fix_nh.html"><span class="doc">fix npt/eff</span></a>,
 etc.</p>
 <p>The calculation performed by this compute is exactly like that
 described by the <a class="reference internal" href="compute_temp_deform.html"><span class="doc">compute temp/deform</span></a>
@@ -180,8 +180,7 @@ LAMMPS was built with that package.  See the <a class="reference internal" href=
 </div>
 <div class="section" id="related-commands">
 <h2>Related commands</h2>
-<p><a class="reference internal" href="compute_temp_ramp.html"><span class="doc">compute temp/ramp</span></a>, <a class="reference internal" href="fix_deform.html"><span class="doc">fix deform</span></a>,
-<a class="reference internal" href="fix_nvt_sllod_eff.html"><span class="doc">fix nvt/sllod/eff</span></a></p>
+<p><a class="reference internal" href="compute_temp_ramp.html"><span class="doc">compute temp/ramp</span></a>, <span class="xref doc">fix deform/eff</span>, <a class="reference internal" href="fix_nvt_sllod_eff.html"><span class="doc">fix nvt/sllod/eff</span></a></p>
 <p><strong>Default:</strong> none</p>
 </div>
 </div>
diff --git a/doc/html/compute_temp_eff.html b/doc/html/compute_temp_eff.html
index 4dadf7de39..e4540a68e8 100644
--- a/doc/html/compute_temp_eff.html
+++ b/doc/html/compute_temp_eff.html
@@ -148,7 +148,7 @@
 <p>Define a computation that calculates the temperature of a group of
 nuclei and electrons in the <a class="reference internal" href="pair_eff.html"><span class="doc">electron force field</span></a>
 model.  A compute of this style can be used by commands that compute a
-temperature, e.g. <a class="reference internal" href="thermo_modify.html"><span class="doc">thermo_modify</span></a>, <a class="reference internal" href="fix_nh_eff.html"><span class="doc">fix npt/eff</span></a>, etc.</p>
+temperature, e.g. <a class="reference internal" href="thermo_modify.html"><span class="doc">thermo_modify</span></a>, <span class="xref doc">fix npt/eff</span>, etc.</p>
 <p>The temperature is calculated by the formula KE = dim/2 N k T, where
 KE = total kinetic energy of the group of atoms (sum of 1/2 m v^2 for
 nuclei and sum of 1/2 (m v^2 + 3/4 m s^2) for electrons, where s
diff --git a/doc/html/dihedral_table.html b/doc/html/dihedral_table.html
index d967731158..91abb1817f 100644
--- a/doc/html/dihedral_table.html
+++ b/doc/html/dihedral_table.html
@@ -301,7 +301,7 @@ more instructions on how to use the accelerated styles effectively.</p>
 <div class="section" id="restrictions">
 <h2>Restrictions</h2>
 <p>This dihedral style can only be used if LAMMPS was built with the
-USER-MISC package.  See the <a class="reference internal" href="Section_start.html#start-3"><span class="std std-ref">Making LAMMPS</span></a>
+USER-MISC package.  See the <span class="xref std std-ref">Making LAMMPS</span>
 section for more info on packages.</p>
 </div>
 <div class="section" id="related-commands">
diff --git a/doc/html/dump.html b/doc/html/dump.html
index b47465bc33..40449d1202 100644
--- a/doc/html/dump.html
+++ b/doc/html/dump.html
@@ -550,7 +550,7 @@ indices from n to N (inclusive).  A middle asterisk means all indices
 from m to n (inclusive).</p>
 <p>Using a wildcard is the same as if the individual columns of the array
 had been listed one by one.  E.g. these 2 dump commands are
-equivalent, since the <a class="reference internal" href="compute_stress_atom.html"><span class="doc">compute stress/atom</span></a>
+equivalent, since the <span class="xref doc">compute stress/atom</span>
 command creates a per-atom array with 6 columns:</p>
 <div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">compute</span> <span class="n">myPress</span> <span class="nb">all</span> <span class="n">stress</span><span class="o">/</span><span class="n">atom</span> <span class="n">NULL</span>
 <span class="n">dump</span> <span class="mi">2</span> <span class="nb">all</span> <span class="n">custom</span> <span class="mi">100</span> <span class="n">tmp</span><span class="o">.</span><span class="n">dump</span> <span class="nb">id</span> <span class="n">myPress</span><span class="p">[</span><span class="o">*</span><span class="p">]</span>
diff --git a/doc/html/dump_custom_vtk.html b/doc/html/dump_custom_vtk.html
index 8bd838f671..d23a7c09f7 100644
--- a/doc/html/dump_custom_vtk.html
+++ b/doc/html/dump_custom_vtk.html
@@ -157,7 +157,7 @@ mol = molecule ID
 proc = ID of processor that owns atom
 procp1 = ID+1 of processor that owns atom
 type = atom type
-element = name of atom element, as defined by <a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify</span></a> command
+element = name of atom element, as defined by <span class="xref doc">dump_modify</span> command
 mass = atom mass
 x,y,z = unscaled atom coordinates
 xs,ys,zs = scaled atom coordinates
@@ -194,7 +194,7 @@ depending on the filename extension specified. This can be either
 for the XML format; see the <a class="reference external" href="http://www.vtk.org/VTK/img/file-formats.pdf">VTK homepage</a> for a detailed
 description of these formats.  Since this naming convention conflicts
 with the way binary output is usually specified (see below),
-<a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify binary</span></a> allows to set the binary
+<span class="xref doc">dump_modify binary</span> allows to set the binary
 flag for this dump style explicitly.</p>
 </div>
 <div class="section" id="description">
@@ -203,9 +203,9 @@ flag for this dump style explicitly.</p>
 timesteps in a format readable by the <a class="reference external" href="http://www.vtk.org">VTK visualization toolkit</a> or other visualization tools that use it,
 e.g. <a class="reference external" href="http://www.paraview.org">ParaView</a>.  The timesteps on which dump
 output is written can also be controlled by a variable; see the
-<a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify every</span></a> command for details.</p>
+<span class="xref doc">dump_modify every</span> command for details.</p>
 <p>Only information for atoms in the specified group is dumped.  The
-<a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify thresh and region</span></a> commands can also
+<span class="xref doc">dump_modify thresh and region</span> commands can also
 alter what atoms are included; see details below.</p>
 <p>As described below, special characters (&#8220;*&#8221;, &#8220;%&#8221;) in the filename
 determine the kind of output.</p>
@@ -218,7 +218,7 @@ box.</p>
 </div>
 <div class="admonition warning">
 <p class="first admonition-title">Warning</p>
-<p class="last">Unless the <a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify sort</span></a>
+<p class="last">Unless the <span class="xref doc">dump_modify sort</span>
 option is invoked, the lines of atom information written to dump files
 will be in an indeterminate order for each snapshot.  This is even
 true when running on a single processor, if the <a class="reference internal" href="atom_modify.html"><span class="doc">atom_modify sort</span></a> option is on, which it is by default.  In this
@@ -228,7 +228,7 @@ data for a single snapshot is collected from multiple processors, each
 of which owns a subset of the atoms.</p>
 </div>
 <p>For the <em>custom/vtk</em> style, sorting is off by default. See the
-<a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify</span></a> doc page for details.</p>
+<span class="xref doc">dump_modify</span> doc page for details.</p>
 <hr class="docutils" />
 <p>The dimensions of the simulation box are written to a separate file
 for each snapshot (either in legacy VTK or XML format depending on
@@ -261,20 +261,20 @@ timestep 0) and on the last timestep of a minimization if the
 minimization converges.  Note that this means a dump will not be
 performed on the initial timestep after the dump command is invoked,
 if the current timestep is not a multiple of N.  This behavior can be
-changed via the <a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify first</span></a> command, which
+changed via the <span class="xref doc">dump_modify first</span> command, which
 can also be useful if the dump command is invoked after a minimization
 ended on an arbitrary timestep.  N can be changed between runs by
-using the <a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify every</span></a> command.
-The <a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify every</span></a> command
+using the <span class="xref doc">dump_modify every</span> command.
+The <span class="xref doc">dump_modify every</span> command
 also allows a variable to be used to determine the sequence of
 timesteps on which dump files are written.  In this mode a dump on the
 first timestep of a run will also not be written unless the
-<a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify first</span></a> command is used.</p>
+<span class="xref doc">dump_modify first</span> command is used.</p>
 <p>Dump filenames can contain two wildcard characters.  If a &#8220;*&#8221;
 character appears in the filename, then one file per snapshot is
 written and the &#8220;*&#8221; character is replaced with the timestep value.
 For example, tmp.dump*.vtk becomes tmp.dump0.vtk, tmp.dump10000.vtk,
-tmp.dump20000.vtk, etc.  Note that the <a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify pad</span></a>
+tmp.dump20000.vtk, etc.  Note that the <span class="xref doc">dump_modify pad</span>
 command can be used to insure all timestep numbers are the same length
 (e.g. 00010), which can make it easier to read a series of dump files
 in order with some post-processing tools.</p>
@@ -286,7 +286,7 @@ tmp.dump_P-1.vtp, etc.  This creates smaller files and can be a fast
 mode of output on parallel machines that support parallel I/O for output.</p>
 <p>By default, P = the number of processors meaning one file per
 processor, but P can be set to a smaller value via the <em>nfile</em> or
-<em>fileper</em> keywords of the <a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify</span></a> command.
+<em>fileper</em> keywords of the <span class="xref doc">dump_modify</span> command.
 These options can be the most efficient way of writing out dump files
 when running on large numbers of processors.</p>
 <p>For the legacy VTK format &#8220;%&#8221; is ignored and P = 1, i.e., only
@@ -305,7 +305,7 @@ part of the <em>custom/vtk</em> style.</p>
 <p><em>id</em> is the atom ID.  <em>mol</em> is the molecule ID, included in the data
 file for molecular systems.  <em>type</em> is the atom type.  <em>element</em> is
 typically the chemical name of an element, which you must assign to
-each type via the <a class="reference internal" href="dump_modify.html"><span class="doc">dump_modify element</span></a> command.
+each type via the <span class="xref doc">dump_modify element</span> command.
 More generally, it can be any string you wish to associate with an
 atom type.  <em>mass</em> is the atom mass.  <em>vx</em>, <em>vy</em>, <em>vz</em>, <em>fx</em>, <em>fy</em>,
 <em>fz</em>, and <em>q</em> are components of atom velocity and force and atomic
diff --git a/doc/html/dump_image.html b/doc/html/dump_image.html
index 322171c9e1..2df937c169 100644
--- a/doc/html/dump_image.html
+++ b/doc/html/dump_image.html
@@ -481,6 +481,11 @@ change this via the <a class="reference internal" href="dump_modify.html"><span
 <p>The <em>fix</em> keyword can be used with a <a class="reference internal" href="fix.html"><span class="doc">fix</span></a> that produces
 objects to be drawn.  An example is the <span class="xref doc">fix surface/global</span> command which can draw lines
 or triangles for 2d/3d simulations.</p>
+<div class="admonition note">
+<p class="first admonition-title">Note</p>
+<p class="last">Aug 2016 - The fix surface/global command is not yet added to
+LAMMPS.</p>
+</div>
 <p>The <em>fflag1</em> and <em>fflag2</em> settings are numerical values which are
 passed to the fix to affect how the drawing of its objects is done.
 See the individual fix doc page for a description of what these
diff --git a/doc/html/fix_ave_chunk.html b/doc/html/fix_ave_chunk.html
index 564b91e3ef..e7bfd31839 100644
--- a/doc/html/fix_ave_chunk.html
+++ b/doc/html/fix_ave_chunk.html
@@ -251,7 +251,7 @@ asterisk means all indices from n to N (inclusive).  A middle asterisk
 means all indices from m to n (inclusive).</p>
 <p>Using a wildcard is the same as if the individual columns of the array
 had been listed one by one.  E.g. these 2 fix ave/chunk commands are
-equivalent, since the <a class="reference internal" href="compute_property_atom.html"><span class="doc">compute property/atom</span></a> command creates, in this
+equivalent, since the <span class="xref doc">compute property/atom</span> command creates, in this
 case, a per-atom array with 3 columns:</p>
 <div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">compute</span> <span class="n">myAng</span> <span class="nb">all</span> <span class="nb">property</span><span class="o">/</span><span class="n">atom</span> <span class="n">angmomx</span> <span class="n">angmomy</span> <span class="n">angmomz</span>
 <span class="n">fix</span> <span class="mi">1</span> <span class="nb">all</span> <span class="n">ave</span><span class="o">/</span><span class="n">chunk</span> <span class="mi">100</span> <span class="mi">1</span> <span class="mi">100</span> <span class="n">cc1</span> <span class="n">c_myAng</span><span class="p">[</span><span class="o">*</span><span class="p">]</span> <span class="n">file</span> <span class="n">tmp</span><span class="o">.</span><span class="n">angmom</span>
@@ -262,7 +262,7 @@ case, a per-atom array with 3 columns:</p>
 <p class="first admonition-title">Note</p>
 <p class="last">This fix works by creating an array of size <em>Nchunk</em> by Nvalues
 on each processor.  <em>Nchunk</em> is the number of chunks which is defined
-by the <a class="reference internal" href="compute_chunk_atom.html"><span class="doc">compute chunk/atom</span></a> command.
+by the <span class="xref doc">compute chunk/atom</span> command.
 Nvalues is the number of input values specified.  Each processor loops
 over its atoms, tallying its values to the appropriate chunk.  Then
 the entire array is summed across all processors.  This means that
diff --git a/doc/html/fix_bond_create.html b/doc/html/fix_bond_create.html
index 8ab79a00a5..a403dbfc40 100644
--- a/doc/html/fix_bond_create.html
+++ b/doc/html/fix_bond_create.html
@@ -237,7 +237,7 @@ become one moleclue due to the created bond, all atoms in the new
 moleclue retain their original molecule IDs.</p>
 </div>
 <p>If the <em>atype</em> keyword is used and if an angle potential is defined
-via the <a class="reference internal" href="angle_style.html"><span class="doc">angle_style</span></a> command, then any new 3-body
+via the <span class="xref doc">angle_style</span> command, then any new 3-body
 interactions inferred by the creation of a bond will create new angles
 of type <em>angletype</em>, with parameters assigned by the corresponding
 <a class="reference internal" href="angle_coeff.html"><span class="doc">angle_coeff</span></a> command.  Likewise, the <em>dtype</em> and
diff --git a/doc/html/fix_deform.html b/doc/html/fix_deform.html
index e8ad31ffb0..5177899d75 100644
--- a/doc/html/fix_deform.html
+++ b/doc/html/fix_deform.html
@@ -570,7 +570,7 @@ is not consistent with fix nvt/sllod.</p>
 <p class="last">For non-equilibrium MD (NEMD) simulations using &#8220;remap v&#8221; it is
 usually desirable that the fluid (or flowing material, e.g. granular
 particles) stream with a velocity profile consistent with the
-deforming box.  As mentioned above, using a thermostat such as <a class="reference internal" href="fix_nvt_sllod.html"><span class="doc">fix nvt/sllod</span></a> or <a class="reference internal" href="fix_langevin.html"><span class="doc">fix lavgevin</span></a>
+deforming box.  As mentioned above, using a thermostat such as <a class="reference internal" href="fix_nvt_sllod.html"><span class="doc">fix nvt/sllod</span></a> or <span class="xref doc">fix lavgevin</span>
 (with a bias provided by <a class="reference internal" href="compute_temp_deform.html"><span class="doc">compute temp/deform</span></a>), will typically accomplish
 that.  If you do not use a thermostat, then there is no driving force
 pushing the atoms to flow in a manner consistent with the deforming
diff --git a/doc/html/fix_deposit.html b/doc/html/fix_deposit.html
index 6e7e0be54f..1e1e637b35 100644
--- a/doc/html/fix_deposit.html
+++ b/doc/html/fix_deposit.html
@@ -264,7 +264,7 @@ time a molecule is deposited, a random number is used to sample from
 the list of relative probabilities.  The N values must sum to 1.0.</p>
 <p>If you wish to insert molecules via the <em>mol</em> keyword, that will be
 treated as rigid bodies, use the <em>rigid</em> keyword, specifying as its
-value the ID of a separate <a class="reference internal" href="fix_rigid.html"><span class="doc">fix rigid/small</span></a>
+value the ID of a separate <span class="xref doc">fix rigid/small</span>
 command which also appears in your input script.</p>
 <p>If you wish to insert molecules via the <em>mol</em> keyword, that will have
 their bonds or angles constrained via SHAKE, use the <em>shake</em> keyword,
diff --git a/doc/html/fix_ehex.html b/doc/html/fix_ehex.html
index f1cc66f404..7e7a529acc 100644
--- a/doc/html/fix_ehex.html
+++ b/doc/html/fix_ehex.html
@@ -231,7 +231,7 @@ resulting temperature profile will therefore be the same.</p>
 the keyword <em>hex</em> is specified.</p>
 <hr class="docutils" />
 <p><strong>Compatibility with SHAKE and RATTLE (rigid molecules)</strong>:</p>
-<p>This fix is compatible with <a class="reference internal" href="fix_shake.html"><span class="doc">fix shake</span></a> and <a class="reference internal" href="fix_shake.html"><span class="doc">fix rattle</span></a>. If either of these constraining algorithms is
+<p>This fix is compatible with <a class="reference internal" href="fix_shake.html"><span class="doc">fix shake</span></a> and <span class="xref doc">fix rattle</span>. If either of these constraining algorithms is
 specified in the input script and the keyword <em>constrain</em> is set, the
 bond distances will be corrected a second time at the end of the
 integration step.  It is recommended to specify the keyword <em>com</em> in
@@ -244,7 +244,7 @@ rescaling takes place if the centre of mass lies outside the region.</p>
 <p class="first admonition-title">Note</p>
 <p class="last">You can only use the keyword <em>com</em> along with <em>constrain</em>.</p>
 </div>
-<p>To achieve the highest accuracy it is recommended to use <a class="reference internal" href="fix_shake.html"><span class="doc">fix rattle</span></a> with the keywords <em>constrain</em> and <em>com</em> as
+<p>To achieve the highest accuracy it is recommended to use <span class="xref doc">fix rattle</span> with the keywords <em>constrain</em> and <em>com</em> as
 shown in the second example. Only if RATTLE is employed, the velocity
 constraints will be satisfied.</p>
 <div class="admonition note">
diff --git a/doc/html/fix_eos_table.html b/doc/html/fix_eos_table.html
index 91f10fe854..040335fdb5 100644
--- a/doc/html/fix_eos_table.html
+++ b/doc/html/fix_eos_table.html
@@ -187,7 +187,7 @@ identifies the section.  The line can contain additional text, but the
 initial text must match the argument specified in the fix command.</p>
 <p>The next line lists the number of table entries.  The parameter &#8220;N&#8221; is
 required and its value is the number of table entries that follow.
-Note that this may be different than the <em>N</em> specified in the <a class="reference internal" href="#"><span class="doc">fix eos/table</span></a> command.  Let Ntable = <em>N</em> in the fix
+Note that this may be different than the <em>N</em> specified in the <span class="xref doc">fix eos/table</span> command.  Let Ntable = <em>N</em> in the fix
 command, and Nfile = &#8220;N&#8221; in the tabulated file.  What LAMMPS does is a
 preliminary interpolation by creating splines using the Nfile
 tabulated values as nodal points.  It uses these to interpolate as
@@ -220,7 +220,7 @@ are not within the table cutoffs.</p>
 </div>
 <div class="section" id="related-commands">
 <h2>Related commands</h2>
-<p><a class="reference internal" href="fix_shardlow.html"><span class="doc">fix shardlow</span></a>, <a class="reference internal" href="pair_dpd_fdt.html"><span class="doc">pair dpd/fdt</span></a></p>
+<p><a class="reference internal" href="fix_shardlow.html"><span class="doc">fix shardlow</span></a>, <span class="xref doc">pair dpd/fdt</span></p>
 <p><strong>Default:</strong> none</p>
 </div>
 </div>
diff --git a/doc/html/fix_eos_table_rx.html b/doc/html/fix_eos_table_rx.html
index 49f1a58f76..95ecfb9ca7 100644
--- a/doc/html/fix_eos_table_rx.html
+++ b/doc/html/fix_eos_table_rx.html
@@ -242,7 +242,7 @@ are not within the table cutoffs.</p>
 <div class="section" id="related-commands">
 <h2>Related commands</h2>
 <p><a class="reference internal" href="fix_rx.html"><span class="doc">fix rx</span></a>,
-<a class="reference internal" href="pair_dpd_fdt.html"><span class="doc">pair dpd/fdt</span></a></p>
+<span class="xref doc">pair dpd/fdt</span></p>
 <p><strong>Default:</strong> none</p>
 <hr class="docutils" />
 </div>
diff --git a/doc/html/fix_lb_momentum.html b/doc/html/fix_lb_momentum.html
index aefe19517d..6d569721fe 100644
--- a/doc/html/fix_lb_momentum.html
+++ b/doc/html/fix_lb_momentum.html
@@ -172,7 +172,7 @@ dimension.</p>
 <h2>Restart, fix_modify, output, run start/stop, minimize info</h2>
 <p>No information about this fix is written to <a class="reference internal" href="restart.html"><span class="doc">binary restart files</span></a>.  None of the <a class="reference internal" href="fix_modify.html"><span class="doc">fix_modify</span></a> options
 are relevant to this fix.  No global or per-atom quantities are stored
-by this fix for access by various <a class="reference internal" href="Section_howto.html#howto-15"><span class="std std-ref">output commands</span></a>.  No parameter of this fix can be
+by this fix for access by various <span class="xref std std-ref">output commands</span>.  No parameter of this fix can be
 used with the <em>start/stop</em> keywords of the <a class="reference internal" href="run.html"><span class="doc">run</span></a> command.
 This fix is not invoked during <a class="reference internal" href="minimize.html"><span class="doc">energy minimization</span></a>.</p>
 </div>
diff --git a/doc/html/fix_lb_pc.html b/doc/html/fix_lb_pc.html
index 8a5281c6c3..5dbd07536f 100644
--- a/doc/html/fix_lb_pc.html
+++ b/doc/html/fix_lb_pc.html
@@ -154,7 +154,7 @@ algorithm if the force coupling constant has been set by default.</p>
 <h2>Restart, fix_modify, output, run start/stop, minimize info</h2>
 <p>No information about this fix is written to <a class="reference internal" href="restart.html"><span class="doc">binary restart files</span></a>.  None of the <a class="reference internal" href="fix_modify.html"><span class="doc">fix_modify</span></a> options
 are relevant to this fix.  No global or per-atom quantities are stored
-by this fix for access by various <a class="reference internal" href="Section_howto.html#howto-15"><span class="std std-ref">output commands</span></a>.  No parameter of this fix can be
+by this fix for access by various <span class="xref std std-ref">output commands</span>.  No parameter of this fix can be
 used with the <em>start/stop</em> keywords of the <a class="reference internal" href="run.html"><span class="doc">run</span></a> command.
 This fix is not invoked during <a class="reference internal" href="minimize.html"><span class="doc">energy minimization</span></a>.</p>
 </div>
diff --git a/doc/html/fix_nh_eff.html b/doc/html/fix_nh_eff.html
index 5e8d66853c..72d8ef25d4 100644
--- a/doc/html/fix_nh_eff.html
+++ b/doc/html/fix_nh_eff.html
@@ -203,7 +203,7 @@ to the temperature or kinetic energy from the electron radial velocity.</p>
 <div class="admonition note">
 <p class="first admonition-title">Note</p>
 <p class="last">there are two different pressures that can be reported for eFF
-when defining the pair_style (see <a class="reference internal" href="pair_eff.html"><span class="doc">pair eff/cut</span></a> to
+when defining the pair_style (see <span class="xref doc">pair eff/cut</span> to
 understand these settings), one (default) that considers electrons do
 not contribute radial virial components (i.e. electrons treated as
 incompressible &#8216;rigid&#8217; spheres) and one that does.  The radial
diff --git a/doc/html/fix_phonon.html b/doc/html/fix_phonon.html
index 40d369afee..0eed3acb9e 100644
--- a/doc/html/fix_phonon.html
+++ b/doc/html/fix_phonon.html
@@ -247,7 +247,7 @@ corresponding reciprocal lattice.</p>
 fix. You can use it to change the temperature compute from thermo_temp
 to the one that reflects the true temperature of atoms in the group.</p>
 <p>No global scalar or vector or per-atom quantities are stored by this
-fix for access by various <a class="reference internal" href="Section_howto.html#howto-15"><span class="std std-ref">output commands</span></a>.</p>
+fix for access by various <span class="xref std std-ref">output commands</span>.</p>
 <p>Instead, this fix outputs its initialization information (including
 mapping information) and the calculated dynamical matrices to the file
 <em>prefix</em>.log, with the specified <em>prefix</em>.  The dynamical matrices are
diff --git a/doc/html/fix_pour.html b/doc/html/fix_pour.html
index ca471ec2c3..ffe5fd5362 100644
--- a/doc/html/fix_pour.html
+++ b/doc/html/fix_pour.html
@@ -232,7 +232,7 @@ time a molecule is inserted, a random number is used to sample from
 the list of relative probabilities.  The N values must sum to 1.0.</p>
 <p>If you wish to insert molecules via the <em>mol</em> keyword, that will be
 treated as rigid bodies, use the <em>rigid</em> keyword, specifying as its
-value the ID of a separate <a class="reference internal" href="fix_rigid.html"><span class="doc">fix rigid/small</span></a>
+value the ID of a separate <span class="xref doc">fix rigid/small</span>
 command which also appears in your input script.</p>
 <p>If you wish to insert molecules via the <em>mol</em> keyword, that will have
 their bonds or angles constrained via SHAKE, use the <em>shake</em> keyword,
diff --git a/doc/html/fix_reax_bonds.html b/doc/html/fix_reax_bonds.html
index 0275b0ad0e..5e30cb4c61 100644
--- a/doc/html/fix_reax_bonds.html
+++ b/doc/html/fix_reax_bonds.html
@@ -155,7 +155,7 @@ specified by <a class="reference internal" href="pair_reax.html"><span class="do
 stand-alone ReaxFF code of Adri van Duin.  The bond information is
 written to <em>filename</em> on timesteps that are multiples of <em>Nevery</em>,
 including timestep 0.  For time-averaged chemical species analysis,
-please see the <a class="reference internal" href="fix_reaxc_species.html"><span class="doc">fix reaxc/c/species</span></a> command.</p>
+please see the <span class="xref doc">fix species</span> command.</p>
 <p>The format of the output file should be self-explantory.</p>
 </div>
 <hr class="docutils" />
diff --git a/doc/html/fix_rx.html b/doc/html/fix_rx.html
index b39fe17f46..fbc1c7b47b 100644
--- a/doc/html/fix_rx.html
+++ b/doc/html/fix_rx.html
@@ -292,7 +292,7 @@ enthalpy DPD simulation.</p>
 <h2>Related commands</h2>
 <p><a class="reference internal" href="fix_eos_table_rx.html"><span class="doc">fix eos/table/rx</span></a>,
 <a class="reference internal" href="fix_shardlow.html"><span class="doc">fix shardlow</span></a>,
-<a class="reference internal" href="pair_dpd_fdt.html"><span class="doc">pair dpd/fdt/energy</span></a></p>
+<span class="xref doc">pair dpd/fdt/energy</span></p>
 <p><strong>Default:</strong> none</p>
 </div>
 </div>
diff --git a/doc/html/fix_shardlow.html b/doc/html/fix_shardlow.html
index 204f18df58..e54353f7eb 100644
--- a/doc/html/fix_shardlow.html
+++ b/doc/html/fix_shardlow.html
@@ -148,7 +148,7 @@
 integrate the DPD equations of motion.  The SSA splits the integration
 into a stochastic and deterministic integration step.  The fix
 <em>shardlow</em> performs the stochastic integration step and must be used
-in conjunction with a deterministic integrator (e.g. <a class="reference internal" href="fix_nve.html"><span class="doc">fix nve</span></a> or <a class="reference internal" href="fix_nh.html"><span class="doc">fix nph</span></a>).  The stochastic
+in conjunction with a deterministic integrator (e.g. <a class="reference internal" href="fix_nve.html"><span class="doc">fix nve</span></a> or <span class="xref doc">fix nph</span>).  The stochastic
 integration of the dissipative and random forces is performed prior to
 the deterministic integration of the conservative force. Further
 details regarding the method are provided in <a class="reference internal" href="pair_dpd_fdt.html#lisal"><span class="std std-ref">(Lisal)</span></a> and
diff --git a/doc/html/fix_smd.html b/doc/html/fix_smd.html
index 5be23f5d7b..77238ccae4 100644
--- a/doc/html/fix_smd.html
+++ b/doc/html/fix_smd.html
@@ -168,10 +168,10 @@
 <div class="section" id="description">
 <h2>Description</h2>
 <p>This fix implements several options of steered MD (SMD) as reviewed in
-<a class="reference internal" href="#izrailev"><span class="std std-ref">(Izrailev)</span></a>, which allows to induce conformational changes
+<span class="xref std std-ref">(Izrailev)</span>, which allows to induce conformational changes
 in systems and to compute the potential of mean force (PMF) along the
-assumed reaction coordinate <a class="reference internal" href="#park"><span class="std std-ref">(Park)</span></a> based on Jarzynski&#8217;s
-equality <a class="reference internal" href="#jarzynski"><span class="std std-ref">(Jarzynski)</span></a>.  This fix borrows a lot from <a class="reference internal" href="fix_spring.html"><span class="doc">fix spring</span></a> and <a class="reference internal" href="fix_setforce.html"><span class="doc">fix setforce</span></a>.</p>
+assumed reaction coordinate <span class="xref std std-ref">(Park)</span> based on Jarzynski&#8217;s
+equality <span class="xref std std-ref">(Jarzynski)</span>.  This fix borrows a lot from <a class="reference internal" href="fix_spring.html"><span class="doc">fix spring</span></a> and <a class="reference internal" href="fix_setforce.html"><span class="doc">fix setforce</span></a>.</p>
 <p>You can apply a moving spring force to a group of atoms (<em>tether</em>
 style) or between two groups of atoms (<em>couple</em> style).  The spring
 can then be used in either constant velocity (<em>cvel</em>) mode or in
@@ -249,12 +249,14 @@ LAMMPS was built with that package.  See the <a class="reference internal" href=
 <a class="reference internal" href="fix_spring_rg.html"><span class="doc">fix spring/rg</span></a></p>
 <p><strong>Default:</strong> none</p>
 <hr class="docutils" />
-<p id="izrailev"><strong>(Izrailev)</strong> Izrailev, Stepaniants, Isralewitz, Kosztin, Lu, Molnar,
+<p id="israilev"><strong>(Izrailev)</strong> Izrailev, Stepaniants, Isralewitz, Kosztin, Lu, Molnar,
 Wriggers, Schulten. Computational Molecular Dynamics: Challenges,
 Methods, Ideas, volume 4 of Lecture Notes in Computational Science and
 Engineering, pp. 39-65. Springer-Verlag, Berlin, 1998.</p>
-<p id="park"><strong>(Park)</strong> Park, Schulten, J. Chem. Phys. 120 (13), 5946 (2004)</p>
-<p id="jarzynski"><strong>(Jarzynski)</strong> Jarzynski, Phys. Rev. Lett. 78, 2690 (1997)</p>
+<p><strong>(Park)</strong>
+Park, Schulten, J. Chem. Phys. 120 (13), 5946 (2004)</p>
+<p><strong>(Jarzynski)</strong>
+Jarzynski, Phys. Rev. Lett. 78, 2690 (1997)</p>
 </div>
 </div>
 
diff --git a/doc/html/fix_wall_piston.html b/doc/html/fix_wall_piston.html
index ade80b4497..8ebde444af 100644
--- a/doc/html/fix_wall_piston.html
+++ b/doc/html/fix_wall_piston.html
@@ -205,7 +205,7 @@ define the lattice spacings.</p>
 <h2>Restart, fix_modify, output, run start/stop, minimize info</h2>
 <p>No information about this fix is written to <a class="reference internal" href="restart.html"><span class="doc">binary restart files</span></a>.  None of the <a class="reference internal" href="fix_modify.html"><span class="doc">fix_modify</span></a> options
 are relevant to this fix.  No global or per-atom quantities are stored
-by this fix for access by various <a class="reference internal" href="Section_howto.html#howto-15"><span class="std std-ref">output commands</span></a>.  No parameter of this fix can
+by this fix for access by various <span class="xref std std-ref">output commands</span>.  No parameter of this fix can
 be used with the <em>start/stop</em> keywords of the <a class="reference internal" href="run.html"><span class="doc">run</span></a> command.
 This fix is not invoked during <a class="reference internal" href="minimize.html"><span class="doc">energy minimization</span></a>.</p>
 </div>
diff --git a/doc/html/kspace_style.html b/doc/html/kspace_style.html
index 1d279e5f3e..b0f9946dac 100644
--- a/doc/html/kspace_style.html
+++ b/doc/html/kspace_style.html
@@ -132,7 +132,7 @@
 </pre></div>
 </div>
 <ul class="simple">
-<li>style = <em>none</em> or <em>ewald</em> or <em>ewald/disp</em> or <em>ewald/omp</em> or <em>pppm</em> or <em>pppm/cg</em> or <em>pppm/disp</em> or <em>pppm/tip4p</em> or <em>pppm/stagger</em> or <em>pppm/disp/tip4p</em> or <em>pppm/gpu</em> or <em>pppm/omp</em> or <em>pppm/cg/omp</em> or <em>pppm/tip4p/omp</em> or <em>msm</em> or <em>msm/cg</em> or <em>msm/omp</em> or <em>msm/cg/omp</em></li>
+<li>style = <em>none</em> or <em>ewald</em> or <em>ewald/disp</em> or <em>ewald/omp</em> or <em>pppm</em> or <em>pppm/cg</em> or <em>pppm/disp</em> or <em>pppm/tip4p</em> or <em>pppm/stagger</em> or <em>pppm/disp/tip4p</em> or <em>pppm/gpu</em> or <em>pppm/kk</em> or <em>pppm/omp</em> or <em>pppm/cg/omp</em> or <em>pppm/tip4p/omp</em> or <em>msm</em> or <em>msm/cg</em> or <em>msm/omp</em> or <em>msm/cg/omp</em></li>
 </ul>
 <pre class="literal-block">
 <em>none</em> value = none
@@ -155,6 +155,8 @@
   accuracy = desired relative error in forces
 <em>pppm/gpu</em> value = accuracy
   accuracy = desired relative error in forces
+<em>pppm/kk</em> value = accuracy
+  accuracy = desired relative error in forces
 <em>pppm/omp</em> value = accuracy
   accuracy = desired relative error in forces
 <em>pppm/cg/omp</em> value = accuracy
@@ -195,7 +197,7 @@ style, the cutoff for Coulombic or 1/r^N interactions is effectively
 infinite.  If the Coulombic case, this means each charge in the system
 interacts with charges in an infinite array of periodic images of the
 simulation domain.</p>
-<p>Note that using a long-range solver requires use of a matching <a class="reference internal" href="pair_style.html"><span class="doc">pair style</span></a> to perform consistent short-range pairwise
+<p>Note that using a long-range solver requires use of a matching <span class="xref doc">pair style</span> to perform consistent short-range pairwise
 calculations.  This means that the name of the pair style contains a
 matching keyword to the name of the KSpace style, as in this table:</p>
 <table border="1" class="docutils">
@@ -385,6 +387,9 @@ If <em>pppm/gpu</em> is used with a GPU-enabled pair style, part of the PPPM
 calculation can be performed concurrently on the GPU while other
 calculations for non-bonded and bonded force calculation are performed
 on the CPU.</p>
+<p>The <em>pppm/kk</em> style also performs charge assignment and force
+interpolation calculations on the GPU while the FFTs themselves are
+calculated on the CPU in non-threaded mode.</p>
 <p>These accelerated styles are part of the GPU, USER-INTEL,
 KOKKOS, USER-OMP, and OPT packages respectively.  They are only
 enabled if LAMMPS was built with those packages.  See the <a class="reference internal" href="Section_start.html#start-3"><span class="std std-ref">Making LAMMPS</span></a> section for more info.</p>
@@ -406,10 +411,10 @@ the KSPACE package is installed by default.</p>
 <p>For MSM, a simulation must be 3d and one can use any combination of
 periodic, non-periodic, or shrink-wrapped boundaries (specified using
 the <a class="reference internal" href="boundary.html"><span class="doc">boundary</span></a> command).</p>
-<p>For Ewald and PPPM, a simulation must be 3d and periodic in all dimensions.
-The only exception is if the slab option is set with <a class="reference internal" href="kspace_modify.html"><span class="doc">kspace_modify</span></a>,
-in which case the xy dimensions must be periodic and the z dimension must be
-non-periodic.</p>
+<p>For Ewald and PPPM, a simulation must be 3d and periodic in all
+dimensions.  The only exception is if the slab option is set with
+<a class="reference internal" href="kspace_modify.html"><span class="doc">kspace_modify</span></a>, in which case the xy dimensions
+must be periodic and the z dimension must be non-periodic.</p>
 </div>
 <div class="section" id="related-commands">
 <h2>Related commands</h2>
@@ -435,9 +440,10 @@ and Computation 5, 2322 (2009)</p>
 <p id="veld"><strong>(Veld)</strong> In &#8216;t Veld, Ismail, Grest, J Chem Phys, 127, 144711 (2007).</p>
 <p id="toukmaji"><strong>(Toukmaji)</strong> Toukmaji, Sagui, Board, and Darden, J Chem Phys, 113,
 10913 (2000).</p>
-<p id="isele-holder"><strong>(Isele-Holder)</strong> Isele-Holder, Mitchell, Ismail, J Chem Phys, 137, 174107 (2012).</p>
-<p id="isele-holder2"><strong>(Isele-Holder2)</strong> Isele-Holder, Mitchell, Hammond, Kohlmeyer, Ismail, J Chem Theory
-Comput 9, 5412 (2013).</p>
+<p id="isele-holder"><strong>(Isele-Holder)</strong> Isele-Holder, Mitchell, Ismail, J Chem Phys, 137,
+174107 (2012).</p>
+<p id="isele-holder2"><strong>(Isele-Holder2)</strong> Isele-Holder, Mitchell, Hammond, Kohlmeyer, Ismail,
+J Chem Theory Comput 9, 5412 (2013).</p>
 <p id="hardy"><strong>(Hardy)</strong> David Hardy thesis: Multilevel Summation for the Fast
 Evaluation of Forces for the Simulation of Biomolecules, University of
 Illinois at Urbana-Champaign, (2006).</p>
diff --git a/doc/html/molecule.html b/doc/html/molecule.html
index 094f231291..85087417db 100644
--- a/doc/html/molecule.html
+++ b/doc/html/molecule.html
@@ -297,7 +297,7 @@ manner.</p>
 <p class="last">Whether a section is required depends on how the molecule
 template is used by other LAMMPS commands.  For example, to add a
 molecule via the <a class="reference internal" href="fix_deposit.html"><span class="doc">fix deposit</span></a> command, the Coords
-and Types sections are required.  To add a rigid body via the <a class="reference internal" href="fix_pour.html"><span class="doc">fix pour</span></a> command, the Bonds (Angles, etc) sections are not
+and Types sections are required.  To add a rigid body via the <span class="xref doc">fix pour</span> command, the Bonds (Angles, etc) sections are not
 required, since the molecule will be treated as a rigid body.  Some
 sections are optional.  For example, the <a class="reference internal" href="fix_pour.html"><span class="doc">fix pour</span></a>
 command can be used to add &#8220;molecules&#8221; which are clusters of
diff --git a/doc/html/neighbor.html b/doc/html/neighbor.html
index 65a47dfcc1..d1c2630e8b 100644
--- a/doc/html/neighbor.html
+++ b/doc/html/neighbor.html
@@ -192,7 +192,7 @@ are printed to the screen and log file.  See <a class="reference internal" href=
 <div class="section" id="related-commands">
 <h2>Related commands</h2>
 <p><a class="reference internal" href="neigh_modify.html"><span class="doc">neigh_modify</span></a>, <a class="reference internal" href="units.html"><span class="doc">units</span></a>,
-<a class="reference internal" href="comm_modify.html"><span class="doc">comm_modify</span></a></p>
+<span class="xref doc">comm_modify</span></p>
 </div>
 <div class="section" id="default">
 <h2>Default</h2>
diff --git a/doc/html/pair_brownian.html b/doc/html/pair_brownian.html
index eaf0a723c3..0f4bca68e6 100644
--- a/doc/html/pair_brownian.html
+++ b/doc/html/pair_brownian.html
@@ -227,7 +227,7 @@ to be specified in an input script that reads a restart file.</p>
 <div class="section" id="restrictions">
 <h2>Restrictions</h2>
 <p>These styles are part of the COLLOID package.  They are only enabled
-if LAMMPS was built with that package.  See the <a class="reference internal" href="Section_start.html#start-3"><span class="std std-ref">Making LAMMPS</span></a> section for more info.</p>
+if LAMMPS was built with that package.  See the <span class="xref std std-ref">Making LAMMPS</span> section for more info.</p>
 <p>Only spherical monodisperse particles are allowed for pair_style
 brownian.</p>
 <p>Only spherical particles are allowed for pair_style brownian/poly.</p>
diff --git a/doc/html/pair_dipole.html b/doc/html/pair_dipole.html
index a8b5437180..2325cfa256 100644
--- a/doc/html/pair_dipole.html
+++ b/doc/html/pair_dipole.html
@@ -268,15 +268,17 @@ dipole interactions.  The long-range portion is calculated by using
 <em>ewald_disp</em> of the <a class="reference internal" href="kspace_style.html"><span class="doc">kspace_style</span></a> command. If
 <em>flag_coul</em> is set to <em>off</em>, Coulombic and dipole interactions are not
 computed at all.</p>
-<p>Atoms with dipole moments should be integrated using the <a class="reference internal" href="fix_nve_sphere.html"><span class="doc">fix nve/sphere update dipole</span></a> or the <a class="reference internal" href="fix_nvt_sphere.html"><span class="doc">fix nvt/sphere update dipole</span></a> command to rotate the
+<p>Atoms with dipole moments should be integrated using the <a class="reference internal" href="fix_nve_sphere.html"><span class="doc">fix nve/sphere update dipole</span></a> command to rotate the
 dipole moments.  The <em>omega</em> option on the <a class="reference internal" href="fix_langevin.html"><span class="doc">fix langevin</span></a> command can be used to thermostat the
 rotational motion.  The <a class="reference internal" href="compute_temp_sphere.html"><span class="doc">compute temp/sphere</span></a>
 command can be used to monitor the temperature, since it includes
-rotational degrees of freedom.  The <a class="reference internal" href="atom_style.html"><span class="doc">atom_style hybrid dipole sphere</span></a> command should be used since
-it defines the point dipoles and their rotational state.
-The magnitude and orientation of the dipole moment for each particle
-can be defined by the <a class="reference internal" href="set.html"><span class="doc">set</span></a> command or in the &#8220;Atoms&#8221; section
-of the data file read in by the <a class="reference internal" href="read_data.html"><span class="doc">read_data</span></a> command.</p>
+rotational degrees of freedom.  The <a class="reference internal" href="atom_style.html"><span class="doc">atom_style dipole</span></a> command should be used since it defines the
+point dipoles and their rotational state.  The magnitude of the dipole
+moment for each type of particle can be defined by the
+<span class="xref doc">dipole</span> command or in the &#8220;Dipoles&#8221; section of the data
+file read in by the <a class="reference internal" href="read_data.html"><span class="doc">read_data</span></a> command.  Their initial
+orientation can be defined by the <a class="reference internal" href="set.html"><span class="doc">set dipole</span></a> command or in
+the &#8220;Atoms&#8221; section of the data file.</p>
 <p>The following coefficients must be defined for each pair of atoms
 types via the <a class="reference internal" href="pair_coeff.html"><span class="doc">pair_coeff</span></a> command as in the examples
 above, or in the data file or restart files read by the
@@ -346,8 +348,7 @@ currently supported.</p>
 </div>
 <div class="section" id="related-commands">
 <h2>Related commands</h2>
-<p><a class="reference internal" href="pair_coeff.html"><span class="doc">pair_coeff</span></a>, <a class="reference internal" href="set.html"><span class="doc">set</span></a>, <a class="reference internal" href="read_data.html"><span class="doc">read_data</span></a>,
-<a class="reference internal" href="fix_nve_sphere.html"><span class="doc">fix nve/sphere</span></a>, <a class="reference internal" href="fix_nvt_sphere.html"><span class="doc">fix nvt/sphere</span></a></p>
+<p><a class="reference internal" href="pair_coeff.html"><span class="doc">pair_coeff</span></a></p>
 <p><strong>Default:</strong> none</p>
 <hr class="docutils" />
 <p id="allen"><strong>(Allen)</strong> Allen and Tildesley, Computer Simulation of Liquids,
diff --git a/doc/html/pair_dpd_fdt.html b/doc/html/pair_dpd_fdt.html
index cd8033694c..a06112c8fc 100644
--- a/doc/html/pair_dpd_fdt.html
+++ b/doc/html/pair_dpd_fdt.html
@@ -230,7 +230,7 @@ specified.</p>
 <p>These commands are part of the USER-DPD package.  They are only
 enabled if LAMMPS was built with that package.  See the <a class="reference internal" href="Section_start.html#start-3"><span class="std std-ref">Making LAMMPS</span></a> section for more info.</p>
 <p>Pair styles <em>dpd/fdt</em> and <em>dpd/fdt/energy</em> require use of the
-<a class="reference internal" href="comm_modify.html"><span class="doc">comm_modify vel yes</span></a> option so that velocites are
+<span class="xref doc">communicate vel yes</span> option so that velocites are
 stored by ghost atoms.</p>
 <p>Pair style <em>dpd/fdt/energy</em> requires <a class="reference internal" href="atom_style.html"><span class="doc">atom_style dpd</span></a>
 to be used in order to properly account for the particle internal
diff --git a/doc/html/pair_gayberne.html b/doc/html/pair_gayberne.html
index 0b02a49a51..297c77802a 100644
--- a/doc/html/pair_gayberne.html
+++ b/doc/html/pair_gayberne.html
@@ -288,7 +288,7 @@ enabled if LAMMPS was built with that package.  See the <a class="reference inte
 <p>These pair style require that atoms store torque and a quaternion to
 represent their orientation, as defined by the
 <a class="reference internal" href="atom_style.html"><span class="doc">atom_style</span></a>.  It also require they store a per-type
-<a class="reference internal" href="set.html"><span class="doc">shape</span></a>.  The particles cannot store a per-particle
+<span class="xref doc">shape</span>.  The particles cannot store a per-particle
 diameter.</p>
 <p>This pair style requires that atoms be ellipsoids as defined by the
 <a class="reference internal" href="atom_style.html"><span class="doc">atom_style ellipsoid</span></a> command.</p>
diff --git a/doc/html/pair_line_lj.html b/doc/html/pair_line_lj.html
index f36f573822..f1fb92c17f 100644
--- a/doc/html/pair_line_lj.html
+++ b/doc/html/pair_line_lj.html
@@ -238,7 +238,7 @@ shift, table, and tail options.</p>
 <div class="section" id="restrictions">
 <h2>Restrictions</h2>
 <p>This style is part of the ASPHERE package.  It is only enabled if
-LAMMPS was built with that package.  See the <a class="reference internal" href="Section_start.html#start-3"><span class="std std-ref">Making LAMMPS</span></a> section for more info.</p>
+LAMMPS was built with that package.  See the <span class="xref std std-ref">Making LAMMPS</span> section for more info.</p>
 <p>Defining particles to be line segments so they participate in
 line/line or line/particle interactions requires the use the
 <a class="reference internal" href="atom_style.html"><span class="doc">atom_style line</span></a> command.</p>
diff --git a/doc/html/pair_thole.html b/doc/html/pair_thole.html
index b2742247f9..a6baecd4b6 100644
--- a/doc/html/pair_thole.html
+++ b/doc/html/pair_thole.html
@@ -180,7 +180,7 @@ it also allows for mixing pair coefficients instead of listing them all.
 The <em>lj/cut/thole/long</em> pair style is also a bit faster because it avoids an
 overlay and can benefit from OMP acceleration. Moreover, it uses a more
 precise approximation of the direct Coulomb interaction at short range similar
-to <a class="reference internal" href="pair_cs.html"><span class="doc">coul/long/cs</span></a>, which stabilizes the temperature of
+to <span class="xref doc">coul/long/cs</span>, which stabilizes the temperature of
 Drude particles.</p>
 <p>The <em>thole</em> pair styles compute the Coulomb interaction damped at
 short distances by a function</p>
diff --git a/doc/html/pair_tri_lj.html b/doc/html/pair_tri_lj.html
index e6c6b24ddd..00e10b9632 100644
--- a/doc/html/pair_tri_lj.html
+++ b/doc/html/pair_tri_lj.html
@@ -210,7 +210,7 @@ shift, table, and tail options.</p>
 <div class="section" id="restrictions">
 <h2>Restrictions</h2>
 <p>This style is part of the ASPHERE package.  It is only enabled if
-LAMMPS was built with that package.  See the <a class="reference internal" href="Section_start.html#start-3"><span class="std std-ref">Making LAMMPS</span></a> section for more info.</p>
+LAMMPS was built with that package.  See the <span class="xref std std-ref">Making LAMMPS</span> section for more info.</p>
 <p>Defining particles to be triangles so they participate in tri/tri or
 tri/particle interactions requires the use the <a class="reference internal" href="atom_style.html"><span class="doc">atom_style tri</span></a> command.</p>
 </div>
diff --git a/doc/html/python.html b/doc/html/python.html
index f78a9df834..2962d7dcd9 100644
--- a/doc/html/python.html
+++ b/doc/html/python.html
@@ -554,7 +554,7 @@ building LAMMPS.  LAMMPS must also be built as a shared library and
 your Python function must be able to to load the Python module in
 python/lammps.py that wraps the LAMMPS library interface.  These are
 the same steps required to use Python by itself to wrap LAMMPS.
-Details on these steps are explained in <a class="reference internal" href="Section_python.html"><span class="doc">Section python</span></a>.  Note that it is important that the
+Details on these steps are explained in <span class="xref doc">Section python</span>.  Note that it is important that the
 stand-alone LAMMPS executable and the LAMMPS shared library be
 consistent (built from the same source code files) in order for this
 to work.  If the two have been built at different times using
diff --git a/doc/html/rerun.html b/doc/html/rerun.html
index dc973298a6..520ea38241 100644
--- a/doc/html/rerun.html
+++ b/doc/html/rerun.html
@@ -174,7 +174,7 @@ initial simulation produced the dump file:</p>
 <li>Calculate one or more diagnostic quantities on the snapshots that
 weren&#8217;t computed in the initial run.  These can also be computed with
 settings not used in the initial run, e.g. computing an RDF via the
-<a class="reference internal" href="compute_rdf.html"><span class="doc">compute rdf</span></a> command with a longer cutoff than was
+<span class="xref doc">compute rdf</span> command with a longer cutoff than was
 used initially.</li>
 <li>Calculate the portion of per-atom forces resulting from a subset of
 the potential.  E.g. compute only Coulombic forces.  This can be done
diff --git a/doc/html/variable.html b/doc/html/variable.html
index f5eb258506..7eb3382f6c 100644
--- a/doc/html/variable.html
+++ b/doc/html/variable.html
@@ -1192,7 +1192,7 @@ with a leading $ sign (e.g. $x or ${abc}) versus with a leading &#8220;<a href="
 (e.g. v_x or v_abc).  The former can be used in any input script
 command, including a variable command.  The input script parser
 evaluates the reference variable immediately and substitutes its value
-into the command.  As explained in <a class="reference internal" href="Section_commands.html#cmd-2"><span class="std std-ref">Section commands 3.2</span></a> for &#8220;Parsing rules&#8221;, you can also use
+into the command.  As explained in <span class="xref std std-ref">Section commands 3.2</span> for &#8220;Parsing rules&#8221;, you can also use
 un-named &#8220;immediate&#8221; variables for this purpose.  For example, a
 string like this $((xlo+xhi)/2+sqrt(v_area)) in an input script
 command evaluates the string between the parenthesis as an equal-style
diff --git a/doc/src/Section_howto.txt b/doc/src/Section_howto.txt
index d54d0b2c3f..a680b24529 100644
--- a/doc/src/Section_howto.txt
+++ b/doc/src/Section_howto.txt
@@ -1010,7 +1010,7 @@ system.  Fix nvt/sllod uses "compute
 temp/deform"_compute_temp_deform.html to compute a thermal temperature
 by subtracting out the streaming velocity of the shearing atoms.  The
 velocity profile or other properties of the fluid can be monitored via
-the "fix ave/chunk"_fix_ave_chunk.html command.
+the "fix ave/spatial"_fix_ave_spatial.html command.
 
 As discussed in the previous section on non-orthogonal simulation
 boxes, the amount of tilt or skew that can be applied is limited by
@@ -1970,7 +1970,7 @@ on each of two regions to add/subtract specified amounts of energy to
 both regions.  In both cases, the resulting temperatures of the two
 regions can be monitored with the "compute temp/region" command and
 the temperature profile of the intermediate region can be monitored
-with the "fix ave/chunk"_fix_ave_chunk.html and "compute
+with the "fix ave/spatial"_fix_ave_spatial.html and "compute
 ke/atom"_compute_ke_atom.html commands.
 
 The third method is to perform a reverse non-equilibrium MD simulation
@@ -1979,7 +1979,7 @@ command which implements the rNEMD algorithm of Muller-Plathe.
 Kinetic energy is swapped between atoms in two different layers of the
 simulation box.  This induces a temperature gradient between the two
 layers which can be monitored with the "fix
-ave/chunk"_fix_ave_chunk.html and "compute
+ave/spatial"_fix_ave_spatial.html and "compute
 ke/atom"_compute_ke_atom.html commands.  The fix tallies the
 cumulative energy transfer that it performs.  See the "fix
 thermal/conductivity"_fix_thermal_conductivity.html command for
@@ -2036,7 +2036,7 @@ velocity to prevent the fluid from heating up.
 
 In both cases, the velocity profile setup in the fluid by this
 procedure can be monitored by the "fix
-ave/chunk"_fix_ave_chunk.html command, which determines
+ave/spatial"_fix_ave_spatial.html command, which determines
 grad(Vstream) in the equation above.  E.g. the derivative in the
 y-direction of the Vx component of fluid motion or grad(Vstream) =
 dVx/dy.  The Pxy off-diagonal component of the pressure or stress
@@ -2050,7 +2050,7 @@ using the "fix viscosity"_fix_viscosity.html command which implements
 the rNEMD algorithm of Muller-Plathe.  Momentum in one dimension is
 swapped between atoms in two different layers of the simulation box in
 a different dimension.  This induces a velocity gradient which can be
-monitored with the "fix ave/chunk"_fix_ave_chunk.html command.
+monitored with the "fix ave/spatial"_fix_ave_spatial.html command.
 The fix tallies the cummulative momentum transfer that it performs.
 See the "fix viscosity"_fix_viscosity.html command for details.
 
diff --git a/doc/src/Section_intro.txt b/doc/src/Section_intro.txt
index 99751a13ff..aa31153386 100644
--- a/doc/src/Section_intro.txt
+++ b/doc/src/Section_intro.txt
@@ -260,7 +260,7 @@ calculate "virtual diffraction patterns"_compute_xrd.html
 coupled rigid body integration via the "POEMS"_fix_poems.html library
 "QM/MM coupling"_fix_qmmm.html
 "path-integral molecular dynamics (PIMD)"_fix_ipi.html and "this as well"_fix_pimd.html
-Monte Carlo via "GCMC"_fix_gcmc.html and "tfMC"_fix_tfmc.html "atom swapping"_fix_atom_swap.html and "bond swapping"_fix_bond_swap.html
+Monte Carlo via "GCMC"_fix_gcmc.html and "tfMC"_fix_tfmc.html and "atom swapping"_fix_swap.html
 "Direct Simulation Monte Carlo"_pair_dsmc.html for low-density fluids
 "Peridynamics mesoscale modeling"_pair_peri.html
 "Lattice Boltzmann fluid"_fix_lb_fluid.html
diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt
index 5282eaa4bb..6be763cb70 100644
--- a/doc/src/Section_packages.txt
+++ b/doc/src/Section_packages.txt
@@ -795,7 +795,7 @@ Supporting info:
 "doc/PDF/PDLammps_overview.pdf"_PDF/PDLammps_overview.pdf,
 "doc/PDF/PDLammps_EPS.pdf"_PDF/PDLammps_EPS.pdf,
 "doc/PDF/PDLammps_VES.pdf"_PDF/PDLammps_VES.pdf, "atom_style
-peri"_atom_style.html, "compute damage/atom"_compute_damage_atom.html,
+peri"_atom_style.html, "compute damage"_compute_damage.html,
 "pair_style peri/pmb"_pair_peri.html, examples/peri
  
 :line
@@ -846,8 +846,9 @@ PYTHON package :link(PYTHON),h5
 Contents: A "python"_python.html command which allow you to execute
 Python code from a LAMMPS input script.  The code can be in a separate
 file or embedded in the input script itself.  See "Section python
-11.2"_Section_python.html" for an overview of using Python from
-LAMMPS and for other ways to use LAMMPS and Python together.
+11.2"_Section_python.html#py_2" for an overview of using Python from
+LAMMPS and "Section python"_Section_python.html" for other ways to use
+LAMMPS and Python together.
 
 Building with the PYTHON package assumes you have a Python shared
 library available on your system, which needs to be a Python 2
@@ -1002,7 +1003,7 @@ make machine :pre
 Make.py -p ^rigid -a machine :pre
 
 Supporting info: "compute erotate/rigid"_compute_erotate_rigid.html,
-"fix shake"_fix_shake.html, "fix rattle"_fix_shake.html, "fix
+"fix shake"_fix_shake.html, "fix rattle"_fix_rattle.html, "fix
 rigid/*"_fix_rigid.html, examples/ASPHERE, examples/rigid
 
 :line
@@ -1054,8 +1055,8 @@ make machine :pre
 Make.py -p ^snap -a machine :pre
 
 Supporting info: "pair snap"_pair_snap.html, "compute
-sna/atom"_compute_sna_atom.html, "compute snad/atom"_compute_sna_atom.html,
-"compute snav/atom"_compute_sna_atom.html, examples/snap
+sna/atom"_compute_sna_atom.html, "compute snad/atom"_compute_sna.html,
+"compute snav/atom"_compute_sna.html, examples/snap
  
 :line
  
@@ -1163,8 +1164,8 @@ Package, Description, Author(s), Doc page, Example, Pic/movie, Library
 "USER-SMD"_#USER-SMD, smoothed Mach dynamics, Georg Ganzenmuller (EMI), "userguide.pdf"_PDF/SMD_LAMMPS_userguide.pdf, USER/smd, -, -
 "USER-SMTBQ"_#USER-SMTBQ, Second Moment Tight Binding - QEq potential, Salles & Maras & Politano & Tetot (4), "pair_style smtbq"_pair_smtbq.html, USER/smtbq, -, -
 "USER-SPH"_#USER-SPH, smoothed particle hydrodynamics, Georg Ganzenmuller (EMI), "userguide.pdf"_PDF/SPH_LAMMPS_userguide.pdf, USER/sph, "sph"_sph, -
-"USER-TALLY"_#USER-TALLY, Pairwise tallied computes, Axel Kohlmeyer (Temple U), "compute XXX/tally"_compute_tally.html, USER/tally, -, -
-"USER-VTK"_#USER-VTK, VTK-style dumps, Berger and Queteschiner (6), "compute custom/vtk"_dump_custom_vtk.html, -, -, lib/vtk
+"USER-TALLY"_#USER-TALLY, Pairwise tallied computes, Axel Kohlmeyer (Temple U), "compute <...>/tally"_compute_tally.html, USER/tally, -, -
+"USER-VTK"_#USER-VTK, VTK-style dumps, Berger and Queteschiner (6), "compute custom/vtk"_compute_custom_vtk.html, -, -, lib/vtk
 :tb(ea=c)
 
 :link(atc,http://lammps.sandia.gov/pictures.html#atc)
@@ -1259,7 +1260,7 @@ calculation.  See src/USER-AWPMD/README for more details.
 To build LAMMPS with this package ...
 
 Supporting info: src/USER-AWPMD/README, "fix
-awpmd/cut"_pair_awpmd.html, examples/USER/awpmd
+awpmd/cut"_pair_awpmd_cut.html, examples/USER/awpmd
 
 Author: Ilya Valuev at the JIHT in Russia (valuev at
 physik.hu-berlin.de).  Contact him directly if you have questions.
@@ -1314,7 +1315,7 @@ kinematic diffraction theory.  See src/USER-DIFFRACTION/README for
 more details.
 
 Supporting info: "compute saed"_compute_saed.html, "compute
-xrd"_compute_xrd.html, "fix saed/vtk"_fix_saed_vtk.html,
+xrd"_compute_xrd.html, "fix saed.vtk"_fix_saed_vtk.html,
 examples/USER/diffraction
 
 Author: Shawn P. Coleman (shawn.p.coleman8.ctr at mail.mil) while at
@@ -1336,11 +1337,10 @@ splitting algorithm.  See src/USER-DPD/README for more details.
 Supporting info: /src/USER-DPD/README, "compute dpd"_compute_dpd.html
 "compute dpd/atom"_compute_dpd_atom.html
 "fix eos/cv"_fix_eos_table.html "fix eos/table"_fix_eos_table.html
- "fix eos/table/rx"_fix_eos_table_rx.html "fix shardlow"_fix_shardlow.html
-"fix rx"_fix_rx.html "pair table/rx"_pair_table_rx.html
-"pair dpd/fdt"_pair_dpd_fdt.html "pair dpd/fdt/energy"_pair_dpd_fdt.html
-"pair exp6/rx"_pair_exp6_rx.html "pair multi/lucy"_pair_multi_lucy.html
-"pair multi/lucy/rx"_pair_multi_lucy_rx.html, examples/USER/dpd
+"fix shardlow"_fix_shardlow.html
+"pair_dpd/conservative"_pair_dpd_conservative.html
+"pair_dpd/fdt"_pair_dpd_fdt.html
+"pair_dpd/fdt/energy"_pair_dpd_fdt.html, examples/USER/dpd
 
 Authors: James Larentzos (ARL) (james.p.larentzos.civ at mail.mil),
 Timothy Mattox (Engility Corp) (Timothy.Mattox at engilitycorp.com)
@@ -1440,7 +1440,7 @@ Phi-enabled styles.
 
 Supporting info: examples/accelerate, src/USER-INTEL/TEST
 
-"Section_accelerate"_Section_accelerate.html#acc_3
+"Section_accelerate"_Section_accelerate.html#acc_9
 
 Author: Mike Brown at Intel (michael.w.brown at intel.com).  Contact
 him directly if you have questions.
@@ -1592,7 +1592,7 @@ application itself.
 
 See this doc page to get started:
 
-"dump molfile"_dump_molfile.html
+"dump molfile"_dump_molfile.html#acc_5
 
 The person who created this package is Axel Kohlmeyer at Temple U
 (akohlmey at gmail.com).  Contact him directly if you have questions.
@@ -1609,7 +1609,7 @@ styles, and fix styles.
  
 See this section of the manual to get started:
 
-"Section_accelerate"_Section_accelerate.html#acc_3
+"Section_accelerate"_Section_accelerate.html#acc_5
 
 The person who created this package is Axel Kohlmeyer at Temple U
 (akohlmey at gmail.com).  Contact him directly if you have questions.
diff --git a/doc/src/Section_start.txt b/doc/src/Section_start.txt
index 0231fb3357..c1b0e796b6 100644
--- a/doc/src/Section_start.txt
+++ b/doc/src/Section_start.txt
@@ -1353,7 +1353,7 @@ Note that the keywords do not use a leading minus sign.  I.e. the
 keyword is "t", not "-t".  Also note that each of the keywords has a
 default setting.  Example of when to use these options and what
 settings to use on different platforms is given in "Section
-5.8"_Section_accelerate.html#acc_3.
+5.8"_Section_accelerate.html#acc_8.
 
 d or device
 g or gpus
diff --git a/doc/src/accelerate_kokkos.txt b/doc/src/accelerate_kokkos.txt
index a3d1c9efbd..b69bf8a3d9 100644
--- a/doc/src/accelerate_kokkos.txt
+++ b/doc/src/accelerate_kokkos.txt
@@ -246,7 +246,7 @@ used if running with KOKKOS_DEVICES=Pthreads for pthreads.  It is not
 necessary for KOKKOS_DEVICES=OpenMP for OpenMP, because OpenMP
 provides alternative methods via environment variables for binding
 threads to hardware cores.  More info on binding threads to cores is
-given in "this section"_Section_accelerate.html#acc_3.
+given in "this section"_Section_accelerate.html#acc_8.
 
 KOKKOS_ARCH=KNC enables compiler switches needed when compling for an
 Intel Phi processor.
diff --git a/doc/src/body.txt b/doc/src/body.txt
index 1cf9dff770..96debf8f81 100644
--- a/doc/src/body.txt
+++ b/doc/src/body.txt
@@ -175,6 +175,9 @@ The {bflag2} argument is ignored.
 
 [Specifics of body style rounded/polygon:]
 
+NOTE: Aug 2016 - This body style has not yet been added to LAMMPS.
+The info below is a placeholder.
+
 The {rounded/polygon} body style represents body particles as a convex
 polygon with a variable number N > 2 of vertices, which can only be
 used for 2d models.  One example use of this body style is for 2d
@@ -235,8 +238,9 @@ particles whose edge length is sqrt(2):
 0 1 1 2 2 3 3 0
 1.0 :pre
 
-The "pair_style body/rounded/polygon"_pair_body_rounded_polygon.html command
-can be used with this body style to compute body/body interactions.
+The "pair_style body/rounded/polygon"_pair_body_rounded_polygon.html
+command can be used with this body style to compute body/body
+interactions.
 
 For output purposes via the "compute
 body/local"_compute_body_local.html and "dump local"_dump.html
diff --git a/doc/src/compute_damage_atom.txt b/doc/src/compute_damage_atom.txt
index ab76cf82de..22be26b573 100644
--- a/doc/src/compute_damage_atom.txt
+++ b/doc/src/compute_damage_atom.txt
@@ -57,7 +57,7 @@ LAMMPS"_Section_start.html#start_3 section for more info.
 
 [Related commands:]
 
-"compute dilatation/atom"_compute_dilatation_atom.html,
-"compute plasticity/atom"_compute_plasticity_atom.html
+"compute dilatation"_compute_dilatation.html, "compute
+plasticity"_compute_plasticity.html
 
 [Default:] none
diff --git a/doc/src/compute_dilatation_atom.txt b/doc/src/compute_dilatation_atom.txt
index 0bd27184e3..7d550e48f1 100644
--- a/doc/src/compute_dilatation_atom.txt
+++ b/doc/src/compute_dilatation_atom.txt
@@ -60,7 +60,7 @@ LAMMPS"_Section_start.html#start_3 section for more info.
 
 [Related commands:]
 
-"compute damage/atom"_compute_damage_atom.html,
-"compute plasticity/atom"_compute_plasticity_atom.html
+"compute damage"_compute_damage.html, "compute
+plasticity"_compute_plasticity.html
 
 [Default:] none 
diff --git a/doc/src/compute_erotate_rigid.txt b/doc/src/compute_erotate_rigid.txt
index 58f511a624..3044449faf 100644
--- a/doc/src/compute_erotate_rigid.txt
+++ b/doc/src/compute_erotate_rigid.txt
@@ -56,6 +56,6 @@ LAMMPS"_Section_start.html#start_3 section for more info.
 
 [Related commands:]
 
-"compute ke/rigid"_compute_ke_rigid.html
+"compute ke/rigid"_compute_erotate_ke_rigid.html
 
 [Default:] none
diff --git a/doc/src/compute_plasticity_atom.txt b/doc/src/compute_plasticity_atom.txt
index 5e6b86641f..ebea299b26 100644
--- a/doc/src/compute_plasticity_atom.txt
+++ b/doc/src/compute_plasticity_atom.txt
@@ -54,8 +54,8 @@ LAMMPS"_Section_start.html#start_3 section for more info.
 
 [Related commands:]
 
-"compute damage/atom"_compute_damage_atom.html,
-"compute dilatation/atom"_compute_dilatation_atom.html
+"compute damage"_compute_damage.html, "compute
+dilatation"_compute_dilatation.html
 
 [Default:] none 
 
diff --git a/doc/src/compute_reduce.txt b/doc/src/compute_reduce.txt
index b28f94d969..09ab48b12f 100644
--- a/doc/src/compute_reduce.txt
+++ b/doc/src/compute_reduce.txt
@@ -93,7 +93,7 @@ means all indices from m to n (inclusive).
 
 Using a wildcard is the same as if the individual columns of the array
 had been listed one by one.  E.g. these 2 compute reduce commands are
-equivalent, since the "compute stress/atom"_compute_stress_atom.html
+equivalent, since the "compute stress/atom"_compute_stress/atom.html
 command creates a per-atom array with 6 columns:
 
 compute myPress all stress/atom NULL
diff --git a/doc/src/compute_temp_deform_eff.txt b/doc/src/compute_temp_deform_eff.txt
index d09a0ace2f..199fa5fdc4 100644
--- a/doc/src/compute_temp_deform_eff.txt
+++ b/doc/src/compute_temp_deform_eff.txt
@@ -26,12 +26,12 @@ nuclei and electrons in the "electron force field"_pair_eff.html
 model, after subtracting out a streaming velocity induced by the
 simulation box changing size and/or shape, for example in a
 non-equilibrium MD (NEMD) simulation.  The size/shape change is
-induced by use of the "fix deform"_fix_deform.html command.  A
+induced by use of the "fix deform/eff"_fix_deform_eff.html command.  A
 compute of this style is created by the "fix
 nvt/sllod/eff"_fix_nvt_sllod_eff.html command to compute the thermal
 temperature of atoms for thermostatting purposes.  A compute of this
 style can also be used by any command that computes a temperature,
-e.g. "thermo_modify"_thermo_modify.html, "fix npt/eff"_fix_nh_eff.html,
+e.g. "thermo_modify"_thermo_modify.html, "fix npt/eff"_fix_nh.html,
 etc.
 
 The calculation performed by this compute is exactly like that
@@ -66,7 +66,8 @@ LAMMPS"_Section_start.html#start_3 section for more info.
 
 [Related commands:]
 
-"compute temp/ramp"_compute_temp_ramp.html, "fix deform"_fix_deform.html,
-"fix nvt/sllod/eff"_fix_nvt_sllod_eff.html
+"compute temp/ramp"_compute_temp_ramp.html, "fix
+deform/eff"_fix_deform_eff.html, "fix
+nvt/sllod/eff"_fix_nvt_sllod_eff.html
 
 [Default:] none
diff --git a/doc/src/compute_temp_eff.txt b/doc/src/compute_temp_eff.txt
index 905369fbfd..a3629fef87 100644
--- a/doc/src/compute_temp_eff.txt
+++ b/doc/src/compute_temp_eff.txt
@@ -26,7 +26,7 @@ Define a computation that calculates the temperature of a group of
 nuclei and electrons in the "electron force field"_pair_eff.html
 model.  A compute of this style can be used by commands that compute a
 temperature, e.g. "thermo_modify"_thermo_modify.html, "fix
-npt/eff"_fix_nh_eff.html, etc.
+npt/eff"_fix_npt_eff.html, etc.
 
 The temperature is calculated by the formula KE = dim/2 N k T, where
 KE = total kinetic energy of the group of atoms (sum of 1/2 m v^2 for
diff --git a/doc/src/dihedral_table.txt b/doc/src/dihedral_table.txt
index bad6ad2c0d..2df3ad0b47 100644
--- a/doc/src/dihedral_table.txt
+++ b/doc/src/dihedral_table.txt
@@ -195,7 +195,7 @@ more instructions on how to use the accelerated styles effectively.
 [Restrictions:]
 
 This dihedral style can only be used if LAMMPS was built with the
-USER-MISC package.  See the "Making LAMMPS"_Section_start.html#start_3
+USER-MISC package.  See the "Making LAMMPS"_Section_start.html#2_3
 section for more info on packages.
 
 [Related commands:]
diff --git a/doc/src/dump.txt b/doc/src/dump.txt
index ffeef1b0f6..5b3508ed69 100644
--- a/doc/src/dump.txt
+++ b/doc/src/dump.txt
@@ -441,7 +441,7 @@ from m to n (inclusive).
 
 Using a wildcard is the same as if the individual columns of the array
 had been listed one by one.  E.g. these 2 dump commands are
-equivalent, since the "compute stress/atom"_compute_stress_atom.html
+equivalent, since the "compute stress/atom"_compute_stress/atom.html
 command creates a per-atom array with 6 columns:
 
 compute myPress all stress/atom NULL
diff --git a/doc/src/dump_custom_vtk.txt b/doc/src/dump_custom_vtk.txt
index c2408bab90..7db9390200 100644
--- a/doc/src/dump_custom_vtk.txt
+++ b/doc/src/dump_custom_vtk.txt
@@ -34,7 +34,7 @@ args = list of arguments for a particular style :l
       proc = ID of processor that owns atom
       procp1 = ID+1 of processor that owns atom
       type = atom type
-      element = name of atom element, as defined by "dump_modify"_dump_modify.html command
+      element = name of atom element, as defined by "dump_modify"_dump_modify_vtk.html command
       mass = atom mass
       x,y,z = unscaled atom coordinates
       xs,ys,zs = scaled atom coordinates
@@ -71,7 +71,7 @@ for the XML format; see the "VTK
 homepage"_http://www.vtk.org/VTK/img/file-formats.pdf for a detailed
 description of these formats.  Since this naming convention conflicts
 with the way binary output is usually specified (see below),
-"dump_modify binary"_dump_modify.html allows to set the binary
+"dump_modify binary"_dump_modify_vtk.html allows to set the binary
 flag for this dump style explicitly.
 
 [Description:]
@@ -81,10 +81,10 @@ timesteps in a format readable by the "VTK visualization
 toolkit"_http://www.vtk.org or other visualization tools that use it,
 e.g. "ParaView"_http://www.paraview.org.  The timesteps on which dump
 output is written can also be controlled by a variable; see the
-"dump_modify every"_dump_modify.html command for details.
+"dump_modify every"_dump_modify_vtk.html command for details.
 
 Only information for atoms in the specified group is dumped.  The
-"dump_modify thresh and region"_dump_modify.html commands can also
+"dump_modify thresh and region"_dump_modify_vtk.html commands can also
 alter what atoms are included; see details below.
 
 As described below, special characters ("*", "%") in the filename
@@ -95,7 +95,7 @@ on timesteps when neighbor lists are rebuilt, the coordinates of an
 atom written to a dump file may be slightly outside the simulation
 box.
 
-IMPORTANT NOTE: Unless the "dump_modify sort"_dump_modify.html
+IMPORTANT NOTE: Unless the "dump_modify sort"_dump_modify_vtk.html
 option is invoked, the lines of atom information written to dump files
 will be in an indeterminate order for each snapshot.  This is even
 true when running on a single processor, if the "atom_modify
@@ -106,7 +106,7 @@ data for a single snapshot is collected from multiple processors, each
 of which owns a subset of the atoms.
 
 For the {custom/vtk} style, sorting is off by default. See the
-"dump_modify"_dump_modify.html doc page for details.
+"dump_modify"_dump_modify_vtk.html doc page for details.
 
 :line
 
@@ -148,21 +148,21 @@ timestep 0) and on the last timestep of a minimization if the
 minimization converges.  Note that this means a dump will not be
 performed on the initial timestep after the dump command is invoked,
 if the current timestep is not a multiple of N.  This behavior can be
-changed via the "dump_modify first"_dump_modify.html command, which
+changed via the "dump_modify first"_dump_modify_vtk.html command, which
 can also be useful if the dump command is invoked after a minimization
 ended on an arbitrary timestep.  N can be changed between runs by
-using the "dump_modify every"_dump_modify.html command.
-The "dump_modify every"_dump_modify.html command
+using the "dump_modify every"_dump_modify_vtk.html command.
+The "dump_modify every"_dump_modify_vtk.html command
 also allows a variable to be used to determine the sequence of
 timesteps on which dump files are written.  In this mode a dump on the
 first timestep of a run will also not be written unless the
-"dump_modify first"_dump_modify.html command is used.
+"dump_modify first"_dump_modify_vtk.html command is used.
 
 Dump filenames can contain two wildcard characters.  If a "*"
 character appears in the filename, then one file per snapshot is
 written and the "*" character is replaced with the timestep value.
 For example, tmp.dump*.vtk becomes tmp.dump0.vtk, tmp.dump10000.vtk,
-tmp.dump20000.vtk, etc.  Note that the "dump_modify pad"_dump_modify.html
+tmp.dump20000.vtk, etc.  Note that the "dump_modify pad"_dump_modify_vtk.html
 command can be used to insure all timestep numbers are the same length
 (e.g. 00010), which can make it easier to read a series of dump files
 in order with some post-processing tools.
@@ -176,7 +176,7 @@ mode of output on parallel machines that support parallel I/O for output.
 
 By default, P = the number of processors meaning one file per
 processor, but P can be set to a smaller value via the {nfile} or
-{fileper} keywords of the "dump_modify"_dump_modify.html command.
+{fileper} keywords of the "dump_modify"_dump_modify_vtk.html command.
 These options can be the most efficient way of writing out dump files
 when running on large numbers of processors.
 
@@ -202,7 +202,7 @@ The {id}, {mol}, {proc}, {procp1}, {type}, {element}, {mass}, {vx},
 {id} is the atom ID.  {mol} is the molecule ID, included in the data
 file for molecular systems.  {type} is the atom type.  {element} is
 typically the chemical name of an element, which you must assign to
-each type via the "dump_modify element"_dump_modify.html command.
+each type via the "dump_modify element"_dump_modify_vtk.html command.
 More generally, it can be any string you wish to associate with an
 atom type.  {mass} is the atom mass.  {vx}, {vy}, {vz}, {fx}, {fy},
 {fz}, and {q} are components of atom velocity and force and atomic
diff --git a/doc/src/dump_image.txt b/doc/src/dump_image.txt
index 826c3f992b..a7dd872d63 100644
--- a/doc/src/dump_image.txt
+++ b/doc/src/dump_image.txt
@@ -388,6 +388,9 @@ objects to be drawn.  An example is the "fix
 surface/global"_fix_surface_global.html command which can draw lines
 or triangles for 2d/3d simulations.
 
+NOTE: Aug 2016 - The fix surface/global command is not yet added to
+LAMMPS.
+
 The {fflag1} and {fflag2} settings are numerical values which are
 passed to the fix to affect how the drawing of its objects is done.
 See the individual fix doc page for a description of what these
diff --git a/doc/src/fix_ave_chunk.txt b/doc/src/fix_ave_chunk.txt
index d02fa17e10..796cf5e331 100644
--- a/doc/src/fix_ave_chunk.txt
+++ b/doc/src/fix_ave_chunk.txt
@@ -141,7 +141,7 @@ means all indices from m to n (inclusive).
 Using a wildcard is the same as if the individual columns of the array
 had been listed one by one.  E.g. these 2 fix ave/chunk commands are
 equivalent, since the "compute
-property/atom"_compute_property_atom.html command creates, in this
+property/atom"_compute_property/atom.html command creates, in this
 case, a per-atom array with 3 columns:
 
 compute myAng all property/atom angmomx angmomy angmomz
@@ -150,7 +150,7 @@ fix 2 all ave/chunk 100 1 100 cc1 c_myAng\[1\] c_myAng\[2\] c_myAng\[3\] file tm
 
 NOTE: This fix works by creating an array of size {Nchunk} by Nvalues
 on each processor.  {Nchunk} is the number of chunks which is defined
-by the "compute chunk/atom"_compute_chunk_atom.html command.
+by the "compute chunk/atom"_doc/compute_chunk_atom.html command.
 Nvalues is the number of input values specified.  Each processor loops
 over its atoms, tallying its values to the appropriate chunk.  Then
 the entire array is summed across all processors.  This means that
diff --git a/doc/src/fix_bond_create.txt b/doc/src/fix_bond_create.txt
index 85cef5071a..dac50154d0 100755
--- a/doc/src/fix_bond_create.txt
+++ b/doc/src/fix_bond_create.txt
@@ -122,7 +122,7 @@ become one moleclue due to the created bond, all atoms in the new
 moleclue retain their original molecule IDs.
 
 If the {atype} keyword is used and if an angle potential is defined
-via the "angle_style"_angle_style.html command, then any new 3-body
+via the "angle_style"_angle.html command, then any new 3-body
 interactions inferred by the creation of a bond will create new angles
 of type {angletype}, with parameters assigned by the corresponding
 "angle_coeff"_angle_coeff.html command.  Likewise, the {dtype} and
diff --git a/doc/src/fix_deform.txt b/doc/src/fix_deform.txt
index 6b3118b74f..2f86440664 100644
--- a/doc/src/fix_deform.txt
+++ b/doc/src/fix_deform.txt
@@ -487,7 +487,7 @@ NOTE: For non-equilibrium MD (NEMD) simulations using "remap v" it is
 usually desirable that the fluid (or flowing material, e.g. granular
 particles) stream with a velocity profile consistent with the
 deforming box.  As mentioned above, using a thermostat such as "fix
-nvt/sllod"_fix_nvt_sllod.html or "fix lavgevin"_fix_langevin.html
+nvt/sllod"_fix_nvt_sllod.html or "fix lavgevin"_doc/fix_langevin.html
 (with a bias provided by "compute
 temp/deform"_compute_temp_deform.html), will typically accomplish
 that.  If you do not use a thermostat, then there is no driving force
diff --git a/doc/src/fix_deposit.txt b/doc/src/fix_deposit.txt
index ea9c5643dd..ab8674c90e 100644
--- a/doc/src/fix_deposit.txt
+++ b/doc/src/fix_deposit.txt
@@ -147,7 +147,7 @@ the list of relative probabilities.  The N values must sum to 1.0.
 
 If you wish to insert molecules via the {mol} keyword, that will be
 treated as rigid bodies, use the {rigid} keyword, specifying as its
-value the ID of a separate "fix rigid/small"_fix_rigid.html
+value the ID of a separate "fix rigid/small"_fix_rigid_small.html
 command which also appears in your input script.
 
 If you wish to insert molecules via the {mol} keyword, that will have
diff --git a/doc/src/fix_ehex.txt b/doc/src/fix_ehex.txt
index df3ab4c687..f726bd984b 100644
--- a/doc/src/fix_ehex.txt
+++ b/doc/src/fix_ehex.txt
@@ -111,7 +111,7 @@ the keyword {hex} is specified.
 [Compatibility with SHAKE and RATTLE (rigid molecules)]:
 
 This fix is compatible with "fix shake"_fix_shake.html and "fix
-rattle"_fix_shake.html. If either of these constraining algorithms is
+rattle"_fix_rattle.html. If either of these constraining algorithms is
 specified in the input script and the keyword {constrain} is set, the
 bond distances will be corrected a second time at the end of the
 integration step.  It is recommended to specify the keyword {com} in
@@ -124,7 +124,7 @@ rescaling takes place if the centre of mass lies outside the region.
 NOTE: You can only use the keyword {com} along with {constrain}.
 
 To achieve the highest accuracy it is recommended to use "fix
-rattle"_fix_shake.html with the keywords {constrain} and {com} as
+rattle"_fix_rattle.html with the keywords {constrain} and {com} as
 shown in the second example. Only if RATTLE is employed, the velocity
 constraints will be satisfied.
 
diff --git a/doc/src/fix_eos_table.txt b/doc/src/fix_eos_table.txt
index e606f58a86..b6b7daacad 100644
--- a/doc/src/fix_eos_table.txt
+++ b/doc/src/fix_eos_table.txt
@@ -71,7 +71,7 @@ initial text must match the argument specified in the fix command.
 The next line lists the number of table entries.  The parameter "N" is
 required and its value is the number of table entries that follow.
 Note that this may be different than the {N} specified in the "fix
-eos/table"_fix_eos_table.html command.  Let Ntable = {N} in the fix
+eos/table"_fix_style.html command.  Let Ntable = {N} in the fix
 command, and Nfile = "N" in the tabulated file.  What LAMMPS does is a
 preliminary interpolation by creating splines using the Nfile
 tabulated values as nodal points.  It uses these to interpolate as
@@ -112,6 +112,6 @@ are not within the table cutoffs.
 
 [Related commands:]
 
-"fix shardlow"_fix_shardlow.html, "pair dpd/fdt"_pair_dpd_fdt.html
+"fix shardlow"_fix_shardlow.html, "pair dpd/fdt"_dpd_fdt.html
 
 [Default:] none
diff --git a/doc/src/fix_eos_table_rx.txt b/doc/src/fix_eos_table_rx.txt
index d87449c4c7..9c97151202 100644
--- a/doc/src/fix_eos_table_rx.txt
+++ b/doc/src/fix_eos_table_rx.txt
@@ -135,7 +135,7 @@ are not within the table cutoffs.
 [Related commands:]
 
 "fix rx"_fix_rx.html,
-"pair dpd/fdt"_pair_dpd_fdt.html
+"pair dpd/fdt"_dpd_fdt.html
 
 [Default:] none
 
diff --git a/doc/src/fix_lb_momentum.txt b/doc/src/fix_lb_momentum.txt
index be2f77543f..9e31bea784 100755
--- a/doc/src/fix_lb_momentum.txt
+++ b/doc/src/fix_lb_momentum.txt
@@ -50,7 +50,7 @@ No information about this fix is written to "binary restart
 files"_restart.html.  None of the "fix_modify"_fix_modify.html options
 are relevant to this fix.  No global or per-atom quantities are stored
 by this fix for access by various "output
-commands"_Section_howto.html#howto_15.  No parameter of this fix can be
+commands"_Section_howto.html#4_15.  No parameter of this fix can be
 used with the {start/stop} keywords of the "run"_run.html command.
 This fix is not invoked during "energy minimization"_minimize.html.
 
diff --git a/doc/src/fix_lb_pc.txt b/doc/src/fix_lb_pc.txt
index 81b2c4a284..fe8f03af80 100755
--- a/doc/src/fix_lb_pc.txt
+++ b/doc/src/fix_lb_pc.txt
@@ -35,7 +35,7 @@ No information about this fix is written to "binary restart
 files"_restart.html.  None of the "fix_modify"_fix_modify.html options
 are relevant to this fix.  No global or per-atom quantities are stored
 by this fix for access by various "output
-commands"_Section_howto.html#howto_15.  No parameter of this fix can be
+commands"_Section_howto.html#4_15.  No parameter of this fix can be
 used with the {start/stop} keywords of the "run"_run.html command.
 This fix is not invoked during "energy minimization"_minimize.html.
 
diff --git a/doc/src/fix_nh_eff.txt b/doc/src/fix_nh_eff.txt
index d90ce52caa..817d5786e8 100644
--- a/doc/src/fix_nh_eff.txt
+++ b/doc/src/fix_nh_eff.txt
@@ -75,7 +75,7 @@ doc page), are performed with computes that include the eFF contribution
 to the temperature or kinetic energy from the electron radial velocity.
 
 NOTE: there are two different pressures that can be reported for eFF
-when defining the pair_style (see "pair eff/cut"_pair_eff.html to
+when defining the pair_style (see "pair eff/cut"_pair_eff_cut.html to
 understand these settings), one (default) that considers electrons do
 not contribute radial virial components (i.e. electrons treated as
 incompressible 'rigid' spheres) and one that does.  The radial
diff --git a/doc/src/fix_phonon.txt b/doc/src/fix_phonon.txt
index 46f8f57c33..2d3baacb58 100644
--- a/doc/src/fix_phonon.txt
+++ b/doc/src/fix_phonon.txt
@@ -148,7 +148,7 @@ fix. You can use it to change the temperature compute from thermo_temp
 to the one that reflects the true temperature of atoms in the group.
 
 No global scalar or vector or per-atom quantities are stored by this
-fix for access by various "output commands"_Section_howto.html#howto_15.
+fix for access by various "output commands"_Section_howto.html#4_15.
 
 Instead, this fix outputs its initialization information (including
 mapping information) and the calculated dynamical matrices to the file
diff --git a/doc/src/fix_pour.txt b/doc/src/fix_pour.txt
index 6d5ed3ac06..cd1b1d6853 100644
--- a/doc/src/fix_pour.txt
+++ b/doc/src/fix_pour.txt
@@ -114,7 +114,7 @@ the list of relative probabilities.  The N values must sum to 1.0.
 
 If you wish to insert molecules via the {mol} keyword, that will be
 treated as rigid bodies, use the {rigid} keyword, specifying as its
-value the ID of a separate "fix rigid/small"_fix_rigid.html
+value the ID of a separate "fix rigid/small"_fix_rigid_small.html
 command which also appears in your input script.
 
 If you wish to insert molecules via the {mol} keyword, that will have
diff --git a/doc/src/fix_reax_bonds.txt b/doc/src/fix_reax_bonds.txt
index a0396ce210..cc8966db6f 100644
--- a/doc/src/fix_reax_bonds.txt
+++ b/doc/src/fix_reax_bonds.txt
@@ -31,7 +31,7 @@ reax/c"_pair_reax_c.html in the exact same format as the original
 stand-alone ReaxFF code of Adri van Duin.  The bond information is
 written to {filename} on timesteps that are multiples of {Nevery},
 including timestep 0.  For time-averaged chemical species analysis,
-please see the "fix reaxc/c/species"_fix_reaxc_species.html command.
+please see the "fix species"_fix_species.html command.
 
 The format of the output file should be self-explantory.
 
diff --git a/doc/src/fix_rx.txt b/doc/src/fix_rx.txt
index 97ffd89c65..8180294a91 100644
--- a/doc/src/fix_rx.txt
+++ b/doc/src/fix_rx.txt
@@ -198,6 +198,6 @@ enthalpy DPD simulation.
 
 "fix eos/table/rx"_fix_eos_table_rx.html,
 "fix shardlow"_fix_shardlow.html,
-"pair dpd/fdt/energy"_pair_dpd_fdt.html
+"pair dpd/fdt/energy"_dpd_fdt_energy.html
 
 [Default:] none
diff --git a/doc/src/fix_shardlow.txt b/doc/src/fix_shardlow.txt
index 21de9a9e0b..485f0a4b6f 100644
--- a/doc/src/fix_shardlow.txt
+++ b/doc/src/fix_shardlow.txt
@@ -26,7 +26,7 @@ integrate the DPD equations of motion.  The SSA splits the integration
 into a stochastic and deterministic integration step.  The fix
 {shardlow} performs the stochastic integration step and must be used
 in conjunction with a deterministic integrator (e.g. "fix
-nve"_fix_nve.html or "fix nph"_fix_nh.html).  The stochastic
+nve"_fix_nve.html or "fix nph"_fix_nph.html).  The stochastic
 integration of the dissipative and random forces is performed prior to
 the deterministic integration of the conservative force. Further
 details regarding the method are provided in "(Lisal)"_#Lisal and
diff --git a/doc/src/fix_smd.txt b/doc/src/fix_smd.txt
index aa23cc3fc3..01ec79e71e 100644
--- a/doc/src/fix_smd.txt
+++ b/doc/src/fix_smd.txt
@@ -138,14 +138,14 @@ LAMMPS"_Section_start.html#start_3 section for more info.
 
 :line
 
-:link(Izrailev)
+:link(Israilev)
 [(Izrailev)] Izrailev, Stepaniants, Isralewitz, Kosztin, Lu, Molnar,
 Wriggers, Schulten. Computational Molecular Dynamics: Challenges,
 Methods, Ideas, volume 4 of Lecture Notes in Computational Science and
 Engineering, pp. 39-65. Springer-Verlag, Berlin, 1998.
 
-:link(Park)
-[(Park)] Park, Schulten, J. Chem. Phys. 120 (13), 5946 (2004)
+[(Park)]
+Park, Schulten, J. Chem. Phys. 120 (13), 5946 (2004)
 
-:link(Jarzynski)
-[(Jarzynski)] Jarzynski, Phys. Rev. Lett. 78, 2690 (1997)
+[(Jarzynski)]
+Jarzynski, Phys. Rev. Lett. 78, 2690 (1997)
diff --git a/doc/src/fix_wall_piston.txt b/doc/src/fix_wall_piston.txt
index 5595c0195f..347d1edc58 100644
--- a/doc/src/fix_wall_piston.txt
+++ b/doc/src/fix_wall_piston.txt
@@ -92,7 +92,7 @@ No information about this fix is written to "binary restart
 files"_restart.html.  None of the "fix_modify"_fix_modify.html options
 are relevant to this fix.  No global or per-atom quantities are stored
 by this fix for access by various "output
-commands"_Section_howto.html#howto_15.  No parameter of this fix can
+commands"_Section_howto.html#howoto_15.  No parameter of this fix can
 be used with the {start/stop} keywords of the "run"_run.html command.
 This fix is not invoked during "energy minimization"_minimize.html.
 
diff --git a/doc/src/kspace_style.txt b/doc/src/kspace_style.txt
index f9f27f6e9c..bbe449d9d2 100644
--- a/doc/src/kspace_style.txt
+++ b/doc/src/kspace_style.txt
@@ -12,7 +12,7 @@ kspace_style command :h3
 
 kspace_style style value :pre
 
-style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg} or {pppm/disp} or {pppm/tip4p} or {pppm/stagger} or {pppm/disp/tip4p} or {pppm/gpu} or {pppm/omp} or {pppm/cg/omp} or {pppm/tip4p/omp} or {msm} or {msm/cg} or {msm/omp} or {msm/cg/omp} :ulb,l
+style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg} or {pppm/disp} or {pppm/tip4p} or {pppm/stagger} or {pppm/disp/tip4p} or {pppm/gpu} or {pppm/kk} or {pppm/omp} or {pppm/cg/omp} or {pppm/tip4p/omp} or {msm} or {msm/cg} or {msm/omp} or {msm/cg/omp} :ulb,l
   {none} value = none
   {ewald} value = accuracy
     accuracy = desired relative error in forces
@@ -33,6 +33,8 @@ style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg}
     accuracy = desired relative error in forces
   {pppm/gpu} value = accuracy
     accuracy = desired relative error in forces
+  {pppm/kk} value = accuracy
+    accuracy = desired relative error in forces
   {pppm/omp} value = accuracy
     accuracy = desired relative error in forces
   {pppm/cg/omp} value = accuracy
@@ -74,7 +76,7 @@ interacts with charges in an infinite array of periodic images of the
 simulation domain.
 
 Note that using a long-range solver requires use of a matching "pair
-style"_pair_style.html to perform consistent short-range pairwise
+style"_pair.html to perform consistent short-range pairwise
 calculations.  This means that the name of the pair style contains a
 matching keyword to the name of the KSpace style, as in this table:
 
@@ -271,6 +273,10 @@ calculation can be performed concurrently on the GPU while other
 calculations for non-bonded and bonded force calculation are performed
 on the CPU.
 
+The {pppm/kk} style also performs charge assignment and force
+interpolation calculations on the GPU while the FFTs themselves are
+calculated on the CPU in non-threaded mode.
+
 These accelerated styles are part of the GPU, USER-INTEL,
 KOKKOS, USER-OMP, and OPT packages respectively.  They are only
 enabled if LAMMPS was built with those packages.  See the "Making
@@ -299,10 +305,10 @@ For MSM, a simulation must be 3d and one can use any combination of
 periodic, non-periodic, or shrink-wrapped boundaries (specified using 
 the "boundary"_boundary.html command). 
 
-For Ewald and PPPM, a simulation must be 3d and periodic in all dimensions.  
-The only exception is if the slab option is set with "kspace_modify"_kspace_modify.html, 
-in which case the xy dimensions must be periodic and the z dimension must be 
-non-periodic.
+For Ewald and PPPM, a simulation must be 3d and periodic in all
+dimensions.  The only exception is if the slab option is set with
+"kspace_modify"_kspace_modify.html, in which case the xy dimensions
+must be periodic and the z dimension must be non-periodic.
 
 [Related commands:]
 
@@ -354,11 +360,12 @@ and Computation 5, 2322 (2009)
 10913 (2000).
 
 :link(Isele-Holder)
-[(Isele-Holder)] Isele-Holder, Mitchell, Ismail, J Chem Phys, 137, 174107 (2012). 
+[(Isele-Holder)] Isele-Holder, Mitchell, Ismail, J Chem Phys, 137,
+174107 (2012).
 
 :link(Isele-Holder2)
-[(Isele-Holder2)] Isele-Holder, Mitchell, Hammond, Kohlmeyer, Ismail, J Chem Theory
-Comput 9, 5412 (2013).
+[(Isele-Holder2)] Isele-Holder, Mitchell, Hammond, Kohlmeyer, Ismail,
+J Chem Theory Comput 9, 5412 (2013).
 
 :link(Hardy)
 [(Hardy)] David Hardy thesis: Multilevel Summation for the Fast
diff --git a/doc/src/molecule.txt b/doc/src/molecule.txt
index 7b8c14fdad..05971b5ddc 100644
--- a/doc/src/molecule.txt
+++ b/doc/src/molecule.txt
@@ -184,7 +184,7 @@ NOTE: Whether a section is required depends on how the molecule
 template is used by other LAMMPS commands.  For example, to add a
 molecule via the "fix deposit"_fix_deposit.html command, the Coords
 and Types sections are required.  To add a rigid body via the "fix
-pour"_fix_pour.html command, the Bonds (Angles, etc) sections are not
+pour"_fix_pout.html command, the Bonds (Angles, etc) sections are not
 required, since the molecule will be treated as a rigid body.  Some
 sections are optional.  For example, the "fix pour"_fix_pour.html
 command can be used to add "molecules" which are clusters of
diff --git a/doc/src/neighbor.txt b/doc/src/neighbor.txt
index 7b8f499ba8..b73b893cb2 100644
--- a/doc/src/neighbor.txt
+++ b/doc/src/neighbor.txt
@@ -73,7 +73,7 @@ section"_Section_start.html#start_8 for details.
 [Related commands:]
 
 "neigh_modify"_neigh_modify.html, "units"_units.html,
-"comm_modify"_comm_modify.html
+"comm_modify"_cmom_modify.html
 
 [Default:]
 
diff --git a/doc/src/pair_brownian.txt b/doc/src/pair_brownian.txt
index 33eed77629..2bbff42b9b 100644
--- a/doc/src/pair_brownian.txt
+++ b/doc/src/pair_brownian.txt
@@ -123,7 +123,7 @@ This pair style can only be used via the {pair} keyword of the
 
 These styles are part of the COLLOID package.  They are only enabled
 if LAMMPS was built with that package.  See the "Making
-LAMMPS"_Section_start.html#start_3 section for more info.
+LAMMPS"_Section_start.html#2_3 section for more info.
 
 Only spherical monodisperse particles are allowed for pair_style
 brownian.
diff --git a/doc/src/pair_dipole.txt b/doc/src/pair_dipole.txt
index be6608d57b..f8f5dbf87f 100755
--- a/doc/src/pair_dipole.txt
+++ b/doc/src/pair_dipole.txt
@@ -138,18 +138,19 @@ dipole interactions.  The long-range portion is calculated by using
 computed at all.
 
 Atoms with dipole moments should be integrated using the "fix
-nve/sphere update dipole"_fix_nve_sphere.html or the "fix
-nvt/sphere update dipole"_fix_nvt_sphere.html command to rotate the
+nve/sphere update dipole"_fix_nve_sphere.html command to rotate the
 dipole moments.  The {omega} option on the "fix
 langevin"_fix_langevin.html command can be used to thermostat the
 rotational motion.  The "compute temp/sphere"_compute_temp_sphere.html
 command can be used to monitor the temperature, since it includes
 rotational degrees of freedom.  The "atom_style
-hybrid dipole sphere"_atom_style.html command should be used since
-it defines the point dipoles and their rotational state.
-The magnitude and orientation of the dipole moment for each particle
-can be defined by the "set"_set.html command or in the "Atoms" section
-of the data file read in by the "read_data"_read_data.html command.
+dipole"_atom_style.html command should be used since it defines the
+point dipoles and their rotational state.  The magnitude of the dipole
+moment for each type of particle can be defined by the
+"dipole"_dipole.html command or in the "Dipoles" section of the data
+file read in by the "read_data"_read_data.html command.  Their initial
+orientation can be defined by the "set dipole"_set.html command or in
+the "Atoms" section of the data file.
 
 The following coefficients must be defined for each pair of atoms
 types via the "pair_coeff"_pair_coeff.html command as in the examples
@@ -241,8 +242,7 @@ currently supported.
 
 [Related commands:]
 
-"pair_coeff"_pair_coeff.html, "set"_set.html, "read_data"_read_data.html,
-"fix nve/sphere"_fix_nve_sphere.html, "fix nvt/sphere"_fix_nvt_sphere.html
+"pair_coeff"_pair_coeff.html
 
 [Default:] none
 
diff --git a/doc/src/pair_dpd_fdt.txt b/doc/src/pair_dpd_fdt.txt
index ca1baeabe2..07d5f3d5bb 100644
--- a/doc/src/pair_dpd_fdt.txt
+++ b/doc/src/pair_dpd_fdt.txt
@@ -115,7 +115,7 @@ enabled if LAMMPS was built with that package.  See the "Making
 LAMMPS"_Section_start.html#start_3 section for more info.
 
 Pair styles {dpd/fdt} and {dpd/fdt/energy} require use of the
-"comm_modify vel yes"_comm_modify.html option so that velocites are
+"communicate vel yes"_communicate.html option so that velocites are
 stored by ghost atoms.
 
 Pair style {dpd/fdt/energy} requires "atom_style dpd"_atom_style.html
diff --git a/doc/src/pair_gayberne.txt b/doc/src/pair_gayberne.txt
index 037e5c47e5..109f578549 100755
--- a/doc/src/pair_gayberne.txt
+++ b/doc/src/pair_gayberne.txt
@@ -191,7 +191,7 @@ LAMMPS"_Section_start.html#start_3 section for more info.
 These pair style require that atoms store torque and a quaternion to
 represent their orientation, as defined by the
 "atom_style"_atom_style.html.  It also require they store a per-type
-"shape"_set.html.  The particles cannot store a per-particle
+"shape"_shape.html.  The particles cannot store a per-particle
 diameter.
 
 This pair style requires that atoms be ellipsoids as defined by the
diff --git a/doc/src/pair_line_lj.txt b/doc/src/pair_line_lj.txt
index 85bb3664fa..3e91e97c2d 100644
--- a/doc/src/pair_line_lj.txt
+++ b/doc/src/pair_line_lj.txt
@@ -131,7 +131,7 @@ This pair style can only be used via the {pair} keyword of the
 
 This style is part of the ASPHERE package.  It is only enabled if
 LAMMPS was built with that package.  See the "Making
-LAMMPS"_Section_start.html#start_3 section for more info.
+LAMMPS"_Section_start.html#2_3 section for more info.
 
 Defining particles to be line segments so they participate in
 line/line or line/particle interactions requires the use the
diff --git a/doc/src/pair_thole.txt b/doc/src/pair_thole.txt
index ab3b832ad6..af1dffc1a6 100644
--- a/doc/src/pair_thole.txt
+++ b/doc/src/pair_thole.txt
@@ -60,7 +60,7 @@ it also allows for mixing pair coefficients instead of listing them all.
 The {lj/cut/thole/long} pair style is also a bit faster because it avoids an
 overlay and can benefit from OMP acceleration. Moreover, it uses a more
 precise approximation of the direct Coulomb interaction at short range similar
-to "coul/long/cs"_pair_cs.html, which stabilizes the temperature of
+to "coul/long/cs"_pair_coul_long_cs.html, which stabilizes the temperature of
 Drude particles.
 
 The {thole} pair styles compute the Coulomb interaction damped at
diff --git a/doc/src/pair_tri_lj.txt b/doc/src/pair_tri_lj.txt
index e4eed927f5..cfc64c52fd 100644
--- a/doc/src/pair_tri_lj.txt
+++ b/doc/src/pair_tri_lj.txt
@@ -102,7 +102,7 @@ This pair style can only be used via the {pair} keyword of the
 
 This style is part of the ASPHERE package.  It is only enabled if
 LAMMPS was built with that package.  See the "Making
-LAMMPS"_Section_start.html#start_3 section for more info.
+LAMMPS"_Section_start.html#2_3 section for more info.
 
 Defining particles to be triangles so they participate in tri/tri or
 tri/particle interactions requires the use the "atom_style
diff --git a/doc/src/python.txt b/doc/src/python.txt
index 2ec2fdcfca..2c1e6c0b30 100644
--- a/doc/src/python.txt
+++ b/doc/src/python.txt
@@ -460,7 +460,7 @@ your Python function must be able to to load the Python module in
 python/lammps.py that wraps the LAMMPS library interface.  These are
 the same steps required to use Python by itself to wrap LAMMPS.
 Details on these steps are explained in "Section
-python"_Section_python.html.  Note that it is important that the
+python"_Section.python.html.  Note that it is important that the
 stand-alone LAMMPS executable and the LAMMPS shared library be
 consistent (built from the same source code files) in order for this
 to work.  If the two have been built at different times using
diff --git a/doc/src/rerun.txt b/doc/src/rerun.txt
index c623d4bebc..bd331f2d7e 100644
--- a/doc/src/rerun.txt
+++ b/doc/src/rerun.txt
@@ -53,7 +53,7 @@ Compute the energy and forces of snaphots using a different potential.
 Calculate one or more diagnostic quantities on the snapshots that
 weren't computed in the initial run.  These can also be computed with
 settings not used in the initial run, e.g. computing an RDF via the
-"compute rdf"_compute_rdf.html command with a longer cutoff than was
+"compute rdf"_compute.rdf.html command with a longer cutoff than was
 used initially. :l
 
 Calculate the portion of per-atom forces resulting from a subset of
diff --git a/doc/src/variable.txt b/doc/src/variable.txt
index 9f40beacb4..b928160d57 100644
--- a/doc/src/variable.txt
+++ b/doc/src/variable.txt
@@ -1108,7 +1108,7 @@ with a leading $ sign (e.g. $x or $\{abc\}) versus with a leading "v_"
 command, including a variable command.  The input script parser
 evaluates the reference variable immediately and substitutes its value
 into the command.  As explained in "Section commands
-3.2"_Section_commands.html#cmd_2 for "Parsing rules", you can also use
+3.2"_Section_commands.html#3_2 for "Parsing rules", you can also use
 un-named "immediate" variables for this purpose.  For example, a
 string like this $((xlo+xhi)/2+sqrt(v_area)) in an input script
 command evaluates the string between the parenthesis as an equal-style
diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index c269de41d3..c45be8c973 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -87,6 +87,8 @@ action fix_setforce_kokkos.cpp
 action fix_setforce_kokkos.h
 action fix_wall_reflect_kokkos.cpp
 action fix_wall_reflect_kokkos.h
+action gridcomm_kokkos.cpp gridcomm.cpp
+action gridcomm_kokkos.h gridcomm.h
 action improper_harmonic_kokkos.cpp improper_harmonic.cpp
 action improper_harmonic_kokkos.h improper_harmonic.h
 action kokkos.cpp
@@ -102,6 +104,8 @@ action neigh_list_kokkos.cpp
 action neigh_list_kokkos.h
 action neighbor_kokkos.cpp
 action neighbor_kokkos.h
+action math_special_kokkos.cpp
+action math_special_kokkos.h
 action pair_buck_coul_cut_kokkos.cpp
 action pair_buck_coul_cut_kokkos.h
 action pair_buck_coul_long_kokkos.cpp pair_buck_coul_long.cpp
@@ -167,6 +171,8 @@ action pair_tersoff_mod_kokkos.cpp pair_tersoff_mod.cpp
 action pair_tersoff_mod_kokkos.h pair_tersoff_mod.h
 action pair_tersoff_zbl_kokkos.cpp pair_tersoff_zbl.cpp
 action pair_tersoff_zbl_kokkos.h pair_tersoff_zbl.h
+action pppm_kokkos.cpp pppm.cpp
+action pppm_kokkos.h pppm.h
 action region_block_kokkos.cpp
 action region_block_kokkos.h
 action verlet_kokkos.cpp
diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp
new file mode 100644
index 0000000000..6871ef67ae
--- /dev/null
+++ b/src/KOKKOS/gridcomm_kokkos.cpp
@@ -0,0 +1,614 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include "gridcomm_kokkos.h"
+#include "comm.h"
+#include "kspace.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define SWAPDELTA 8
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+GridCommKokkos<DeviceType>::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse,
+                   int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi,
+                   int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi,
+                   int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi)
+  : Pointers(lmp)
+{
+  gridcomm = gcomm;
+  MPI_Comm_rank(gridcomm,&me);
+
+  nforward = forward;
+  nreverse = reverse;
+
+  inxlo = ixlo;
+  inxhi = ixhi;
+  inylo = iylo;
+  inyhi = iyhi;
+  inzlo = izlo;
+  inzhi = izhi;
+
+  outxlo = oxlo;
+  outxhi = oxhi;
+  outylo = oylo;
+  outyhi = oyhi;
+  outzlo = ozlo;
+  outzhi = ozhi;
+
+  outxlo_max = oxlo;
+  outxhi_max = oxhi;
+  outylo_max = oylo;
+  outyhi_max = oyhi;
+  outzlo_max = ozlo;
+  outzhi_max = ozhi;
+
+  procxlo = pxlo;
+  procxhi = pxhi;
+  procylo = pylo;
+  procyhi = pyhi;
+  proczlo = pzlo;
+  proczhi = pzhi;
+
+  nswap = 0;
+  swap = NULL;
+  //buf1 = buf2 = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+GridCommKokkos<DeviceType>::GridCommKokkos(LAMMPS *lmp, MPI_Comm gcomm, int forward, int reverse,
+                   int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi,
+                   int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi,
+                   int oxlo_max, int oxhi_max, int oylo_max, int oyhi_max,
+                   int ozlo_max, int ozhi_max,
+                   int pxlo, int pxhi, int pylo, int pyhi, int pzlo, int pzhi)
+  : Pointers(lmp)
+{
+  gridcomm = gcomm;
+  MPI_Comm_rank(gridcomm,&me);
+
+  nforward = forward;
+  nreverse = reverse;
+
+  inxlo = ixlo;
+  inxhi = ixhi;
+  inylo = iylo;
+  inyhi = iyhi;
+  inzlo = izlo;
+  inzhi = izhi;
+
+  outxlo = oxlo;
+  outxhi = oxhi;
+  outylo = oylo;
+  outyhi = oyhi;
+  outzlo = ozlo;
+  outzhi = ozhi;
+
+  outxlo_max = oxlo_max;
+  outxhi_max = oxhi_max;
+  outylo_max = oylo_max;
+  outyhi_max = oyhi_max;
+  outzlo_max = ozlo_max;
+  outzhi_max = ozhi_max;
+
+  procxlo = pxlo;
+  procxhi = pxhi;
+  procylo = pylo;
+  procyhi = pyhi;
+  proczlo = pzlo;
+  proczhi = pzhi;
+
+  nswap = 0;
+  swap = NULL;
+  //buf1 = buf2 = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+GridCommKokkos<DeviceType>::~GridCommKokkos()
+{
+  for (int i = 0; i < nswap; i++) {
+    //memory->destroy_kokkos(swap[i].k_packlist,swap[i].packlist);
+    //memory->destroy_kokkos(swap[i].k_unpacklist,swap[i].unpacklist);
+  }
+  memory->sfree(swap);
+
+  //memory->destroy(buf1);
+  //memory->destroy(buf2);
+}
+
+/* ----------------------------------------------------------------------
+   notify 6 neighbor procs how many ghost grid planes I need from them
+   ghostxlo = # of lower grid planes I own that are needed from me
+              by procxlo to become its upper ghost planes
+   ghostxhi = # of upper grid planes I own that are needed from me
+              by procxhi to become its lower ghost planes
+   if no neighbor proc, value is from self
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void GridCommKokkos<DeviceType>::ghost_notify()
+{
+  int nplanes = inxlo - outxlo;
+  if (procxlo != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,procxlo,0,
+                   &ghostxhi,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE);
+  else ghostxhi = nplanes;
+
+  nplanes = outxhi - inxhi;
+  if (procxhi != me)
+      MPI_Sendrecv(&nplanes,1,MPI_INT,procxhi,0,
+                   &ghostxlo,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE);
+  else ghostxlo = nplanes;
+
+  nplanes = inylo - outylo;
+  if (procylo != me)
+    MPI_Sendrecv(&nplanes,1,MPI_INT,procylo,0,
+                 &ghostyhi,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE);
+  else ghostyhi = nplanes;
+
+  nplanes = outyhi - inyhi;
+  if (procyhi != me)
+    MPI_Sendrecv(&nplanes,1,MPI_INT,procyhi,0,
+                 &ghostylo,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE);
+  else ghostylo = nplanes;
+
+  nplanes = inzlo - outzlo;
+  if (proczlo != me)
+    MPI_Sendrecv(&nplanes,1,MPI_INT,proczlo,0,
+                 &ghostzhi,1,MPI_INT,proczhi,0,gridcomm,MPI_STATUS_IGNORE);
+  else ghostzhi = nplanes;
+
+  nplanes = outzhi - inzhi;
+  if (proczhi != me)
+    MPI_Sendrecv(&nplanes,1,MPI_INT,proczhi,0,
+                 &ghostzlo,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE);
+  else ghostzlo = nplanes;
+}
+
+/* ----------------------------------------------------------------------
+   check if all ghost grid comm needs overlap into non nearest-neighbor proc
+   if yes, return 1, else return 0
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+int GridCommKokkos<DeviceType>::ghost_overlap()
+{
+  int nearest = 0;
+  if (ghostxlo > inxhi-inxlo+1) nearest = 1;
+  if (ghostxhi > inxhi-inxlo+1) nearest = 1;
+  if (ghostylo > inyhi-inylo+1) nearest = 1;
+  if (ghostyhi > inyhi-inylo+1) nearest = 1;
+  if (ghostzlo > inzhi-inzlo+1) nearest = 1;
+  if (ghostzhi > inzhi-inzlo+1) nearest = 1;
+
+  int nearest_all;
+  MPI_Allreduce(&nearest,&nearest_all,1,MPI_INT,MPI_MIN,gridcomm);
+
+  return nearest_all;
+}
+
+/* ----------------------------------------------------------------------
+   create swap stencil for grid own/ghost communication
+   swaps covers all 3 dimensions and both directions
+   swaps cover multiple iterations in a direction if need grid pts
+     from further away than nearest-neighbor proc
+   same swap list used by forward and reverse communication
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void GridCommKokkos<DeviceType>::setup()
+{
+  int nsent,sendfirst,sendlast,recvfirst,recvlast;
+  int sendplanes,recvplanes;
+  int notdoneme,notdone;
+
+  int maxswap = 6;
+  swap = (Swap *) memory->smalloc(maxswap*sizeof(Swap),"Commgrid:swap");
+  k_packlist = DAT::tdual_int_2d("Commgrid:packlist",maxswap,1);
+  k_unpacklist = DAT::tdual_int_2d("Commgrid:unpacklist",maxswap,1);
+  nswap = 0;
+
+  // send own grid pts to -x processor, recv ghost grid pts from +x processor
+
+  nsent = 0;
+  sendfirst = inxlo;
+  sendlast = inxhi;
+  recvfirst = inxhi+1;
+  notdone = 1;
+
+  while (notdone) {
+    if (nswap == maxswap) {
+      maxswap += SWAPDELTA;
+      swap = (Swap *)
+        memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap");
+      k_packlist.resize(maxswap,k_packlist.dimension_1());
+      k_unpacklist.resize(maxswap,k_unpacklist.dimension_1());
+    }
+
+    swap[nswap].sendproc = procxlo;
+    swap[nswap].recvproc = procxhi;
+    sendplanes = MIN(sendlast-sendfirst+1,ghostxlo-nsent);
+    swap[nswap].npack =
+      indices(k_packlist,nswap,
+              sendfirst,sendfirst+sendplanes-1,inylo,inyhi,inzlo,inzhi);
+
+    if (procxlo != me)
+      MPI_Sendrecv(&sendplanes,1,MPI_INT,procxlo,0,
+                   &recvplanes,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE);
+    else recvplanes = sendplanes;
+
+    swap[nswap].nunpack =
+      indices(k_unpacklist,nswap,
+              recvfirst,recvfirst+recvplanes-1,inylo,inyhi,inzlo,inzhi);
+
+    nsent += sendplanes;
+    sendfirst += sendplanes;
+    sendlast += recvplanes;
+    recvfirst += recvplanes;
+    nswap++;
+
+    if (nsent < ghostxlo) notdoneme = 1;
+    else notdoneme = 0;
+    MPI_Allreduce(&notdoneme,&notdone,1,MPI_INT,MPI_SUM,gridcomm);
+  }
+
+  // send own grid pts to +x processor, recv ghost grid pts from -x processor
+
+  nsent = 0;
+  sendfirst = inxlo;
+  sendlast = inxhi;
+  recvlast = inxlo-1;
+  notdone = 1;
+
+  while (notdone) {
+    if (nswap == maxswap) {
+      maxswap += 1;
+      swap = (Swap *)
+        memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap");
+      k_packlist.resize(maxswap,k_packlist.dimension_1());
+      k_unpacklist.resize(maxswap,k_unpacklist.dimension_1());
+    }
+
+    swap[nswap].sendproc = procxhi;
+    swap[nswap].recvproc = procxlo;
+    sendplanes = MIN(sendlast-sendfirst+1,ghostxhi-nsent);
+    swap[nswap].npack =
+      indices(k_packlist,nswap,
+              sendlast-sendplanes+1,sendlast,inylo,inyhi,inzlo,inzhi);
+
+    if (procxhi != me)
+      MPI_Sendrecv(&sendplanes,1,MPI_INT,procxhi,0,
+                   &recvplanes,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE);
+    else recvplanes = sendplanes;
+
+    swap[nswap].nunpack =
+      indices(k_unpacklist,nswap,
+              recvlast-recvplanes+1,recvlast,inylo,inyhi,inzlo,inzhi);
+
+    nsent += sendplanes;
+    sendfirst -= recvplanes;
+    sendlast -= sendplanes;
+    recvlast -= recvplanes;
+    nswap++;
+
+    if (nsent < ghostxhi) notdoneme = 1;
+    else notdoneme = 0;
+    MPI_Allreduce(&notdoneme,&notdone,1,MPI_INT,MPI_SUM,gridcomm);
+  }
+
+  // send own grid pts to -y processor, recv ghost grid pts from +y processor
+
+  nsent = 0;
+  sendfirst = inylo;
+  sendlast = inyhi;
+  recvfirst = inyhi+1;
+  notdone = 1;
+
+  while (notdone) {
+    if (nswap == maxswap) {
+      maxswap += SWAPDELTA;
+      swap = (Swap *)
+        memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap");
+      k_packlist.resize(maxswap,k_packlist.dimension_1());
+      k_unpacklist.resize(maxswap,k_unpacklist.dimension_1());
+    }
+
+    swap[nswap].sendproc = procylo;
+    swap[nswap].recvproc = procyhi;
+    sendplanes = MIN(sendlast-sendfirst+1,ghostylo-nsent);
+    swap[nswap].npack =
+      indices(k_packlist,nswap,
+              outxlo,outxhi,sendfirst,sendfirst+sendplanes-1,inzlo,inzhi);
+
+    if (procylo != me)
+      MPI_Sendrecv(&sendplanes,1,MPI_INT,procylo,0,
+                   &recvplanes,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE);
+    else recvplanes = sendplanes;
+
+    swap[nswap].nunpack =
+      indices(k_unpacklist,nswap,
+              outxlo,outxhi,recvfirst,recvfirst+recvplanes-1,inzlo,inzhi);
+
+    nsent += sendplanes;
+    sendfirst += sendplanes;
+    sendlast += recvplanes;
+    recvfirst += recvplanes;
+    nswap++;
+
+    if (nsent < ghostylo) notdoneme = 1;
+    else notdoneme = 0;
+    MPI_Allreduce(&notdoneme,&notdone,1,MPI_INT,MPI_SUM,gridcomm);
+  }
+
+  // send own grid pts to +y processor, recv ghost grid pts from -y processor
+
+  nsent = 0;
+  sendfirst = inylo;
+  sendlast = inyhi;
+  recvlast = inylo-1;
+  notdone = 1;
+
+  while (notdone) {
+    if (nswap == maxswap) {
+      maxswap += 1;
+      swap = (Swap *)
+        memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap");
+      k_packlist.resize(maxswap,k_packlist.dimension_1());
+      k_unpacklist.resize(maxswap,k_unpacklist.dimension_1());
+    }
+
+    swap[nswap].sendproc = procyhi;
+    swap[nswap].recvproc = procylo;
+    sendplanes = MIN(sendlast-sendfirst+1,ghostyhi-nsent);
+    swap[nswap].npack =
+      indices(k_packlist,nswap,
+              outxlo,outxhi,sendlast-sendplanes+1,sendlast,inzlo,inzhi);
+
+    if (procyhi != me)
+      MPI_Sendrecv(&sendplanes,1,MPI_INT,procyhi,0,
+                   &recvplanes,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE);
+    else recvplanes = sendplanes;
+
+    swap[nswap].nunpack =
+      indices(k_unpacklist,nswap,
+              outxlo,outxhi,recvlast-recvplanes+1,recvlast,inzlo,inzhi);
+
+    nsent += sendplanes;
+    sendfirst -= recvplanes;
+    sendlast -= sendplanes;
+    recvlast -= recvplanes;
+    nswap++;
+
+    if (nsent < ghostyhi) notdoneme = 1;
+    else notdoneme = 0;
+    MPI_Allreduce(&notdoneme,&notdone,1,MPI_INT,MPI_SUM,gridcomm);
+  }
+
+  // send own grid pts to -z processor, recv ghost grid pts from +z processor
+
+  nsent = 0;
+  sendfirst = inzlo;
+  sendlast = inzhi;
+  recvfirst = inzhi+1;
+  notdone = 1;
+
+  while (notdone) {
+    if (nswap == maxswap) {
+      maxswap += SWAPDELTA;
+      swap = (Swap *)
+        memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap");
+      k_packlist.resize(maxswap,k_packlist.dimension_1());
+      k_unpacklist.resize(maxswap,k_unpacklist.dimension_1());
+    }
+
+    swap[nswap].sendproc = proczlo;
+    swap[nswap].recvproc = proczhi;
+    sendplanes = MIN(sendlast-sendfirst+1,ghostzlo-nsent);
+    swap[nswap].npack =
+      indices(k_packlist,nswap,
+              outxlo,outxhi,outylo,outyhi,sendfirst,sendfirst+sendplanes-1);
+
+    if (proczlo != me)
+      MPI_Sendrecv(&sendplanes,1,MPI_INT,proczlo,0,
+                   &recvplanes,1,MPI_INT,proczhi,0,gridcomm,MPI_STATUS_IGNORE);
+    else recvplanes = sendplanes;
+
+    swap[nswap].nunpack =
+      indices(k_unpacklist,nswap,
+              outxlo,outxhi,outylo,outyhi,recvfirst,recvfirst+recvplanes-1);
+
+    nsent += sendplanes;
+    sendfirst += sendplanes;
+    sendlast += recvplanes;
+    recvfirst += recvplanes;
+    nswap++;
+
+    if (nsent < ghostzlo) notdoneme = 1;
+    else notdoneme = 0;
+    MPI_Allreduce(&notdoneme,&notdone,1,MPI_INT,MPI_SUM,gridcomm);
+  }
+
+  // send own grid pts to +z processor, recv ghost grid pts from -z processor
+
+  nsent = 0;
+  sendfirst = inzlo;
+  sendlast = inzhi;
+  recvlast = inzlo-1;
+  notdone = 1;
+
+  while (notdone) {
+    if (nswap == maxswap) {
+      maxswap += 1;
+      swap = (Swap *)
+        memory->srealloc(swap,maxswap*sizeof(Swap),"Commgrid:swap");
+      k_packlist.resize(maxswap,k_packlist.dimension_1());
+      k_unpacklist.resize(maxswap,k_unpacklist.dimension_1());
+    }
+
+    swap[nswap].sendproc = proczhi;
+    swap[nswap].recvproc = proczlo;
+    sendplanes = MIN(sendlast-sendfirst+1,ghostzhi-nsent);
+    swap[nswap].npack =
+      indices(k_packlist,nswap,
+              outxlo,outxhi,outylo,outyhi,sendlast-sendplanes+1,sendlast);
+
+    if (proczhi != me)
+      MPI_Sendrecv(&sendplanes,1,MPI_INT,proczhi,0,
+                   &recvplanes,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE);
+    else recvplanes = sendplanes;
+
+    swap[nswap].nunpack =
+      indices(k_unpacklist,nswap,
+              outxlo,outxhi,outylo,outyhi,recvlast-recvplanes+1,recvlast);
+
+    nsent += sendplanes;
+    sendfirst -= recvplanes;
+    sendlast -= sendplanes;
+    recvlast -= recvplanes;
+    nswap++;
+
+    if (nsent < ghostzhi) notdoneme = 1;
+    else notdoneme = 0;
+    MPI_Allreduce(&notdoneme,&notdone,1,MPI_INT,MPI_SUM,gridcomm);
+  }
+
+  // nbuf = max of any forward/reverse pack/unpack
+
+  nbuf = 0;
+  for (int i = 0; i < nswap; i++) {
+    nbuf = MAX(nbuf,swap[i].npack);
+    nbuf = MAX(nbuf,swap[i].nunpack);
+  }
+  nbuf *= MAX(nforward,nreverse);
+  //memory->create(buf1,nbuf,"Commgrid:buf1");
+  k_buf1 = DAT::tdual_FFT_SCALAR_1d("Commgrid:buf1",nbuf);
+  //memory->create(buf2,nbuf,"Commgrid:buf2");
+  k_buf2 = DAT::tdual_FFT_SCALAR_1d("Commgrid:buf2",nbuf);
+}
+
+/* ----------------------------------------------------------------------
+   use swap list in forward order to acquire copy of all needed ghost grid pts
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void GridCommKokkos<DeviceType>::forward_comm(KSpace *kspace, int which)
+{
+  k_packlist.sync<DeviceType>();
+  k_unpacklist.sync<DeviceType>();
+
+  for (int m = 0; m < nswap; m++) {
+    if (swap[m].sendproc == me)
+      kspace->pack_forward_kokkos(which,k_buf2,swap[m].npack,k_packlist,m);
+    else
+      kspace->pack_forward_kokkos(which,k_buf1,swap[m].npack,k_packlist,m);
+
+    if (swap[m].sendproc != me) {
+      MPI_Irecv(k_buf2.view<DeviceType>().ptr_on_device(),nforward*swap[m].nunpack,MPI_FFT_SCALAR,
+                swap[m].recvproc,0,gridcomm,&request);
+      MPI_Send(k_buf1.view<DeviceType>().ptr_on_device(),nforward*swap[m].npack,MPI_FFT_SCALAR,
+               swap[m].sendproc,0,gridcomm);
+      MPI_Wait(&request,MPI_STATUS_IGNORE);
+    }
+
+    kspace->unpack_forward_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   use swap list in reverse order to compute fully summed value
+   for each owned grid pt that some other proc has copy of as a ghost grid pt
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void GridCommKokkos<DeviceType>::reverse_comm(KSpace *kspace, int which)
+{
+  k_packlist.sync<DeviceType>();
+  k_unpacklist.sync<DeviceType>();
+
+  for (int m = nswap-1; m >= 0; m--) {
+    if (swap[m].recvproc == me)
+      kspace->pack_reverse_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m);
+    else
+      kspace->pack_reverse_kokkos(which,k_buf1,swap[m].nunpack,k_unpacklist,m);
+
+    if (swap[m].recvproc != me) {
+      MPI_Irecv(k_buf2.view<DeviceType>().ptr_on_device(),nreverse*swap[m].npack,MPI_FFT_SCALAR,
+                swap[m].sendproc,0,gridcomm,&request);
+      MPI_Send(k_buf1.view<DeviceType>().ptr_on_device(),nreverse*swap[m].nunpack,MPI_FFT_SCALAR,
+               swap[m].recvproc,0,gridcomm);
+      MPI_Wait(&request,MPI_STATUS_IGNORE);
+    }
+
+    kspace->unpack_reverse_kokkos(which,k_buf2,swap[m].npack,k_packlist,m);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi)
+   assume 3d array is allocated as (0:outxhi_max-outxlo_max+1,0:outyhi_max-outylo_max+1,
+     0:outzhi_max-outzlo_max+1)
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+int GridCommKokkos<DeviceType>::indices(DAT::tdual_int_2d &k_list, int index,
+                       int xlo, int xhi, int ylo, int yhi, int zlo, int zhi)
+{
+  int nmax = (xhi-xlo+1) * (yhi-ylo+1) * (zhi-zlo+1);
+  if (k_list.dimension_1() < nmax)
+    k_list.resize(k_list.dimension_0(),nmax);
+
+  int nx = (outxhi_max-outxlo_max+1);
+  int ny = (outyhi_max-outylo_max+1);
+
+  k_list.sync<LMPHostType>();
+
+  int n = 0;
+  int ix,iy,iz;
+  for (iz = zlo; iz <= zhi; iz++)
+    for (iy = ylo; iy <= yhi; iy++)
+      for (ix = xlo; ix <= xhi; ix++)
+        k_list.h_view(index,n++) = (iz-outzlo_max)*ny*nx + (iy-outylo_max)*nx + (ix-outxlo_max);
+
+  k_list.modify<LMPHostType>();
+  k_list.sync<DeviceType>();
+
+  return nmax;
+}
+
+
+/* ----------------------------------------------------------------------
+   memory usage of send/recv bufs
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double GridCommKokkos<DeviceType>::memory_usage()
+{
+  double bytes = 2*nbuf * sizeof(double);
+  return bytes;
+}
+
+namespace LAMMPS_NS {
+template class GridCommKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class GridCommKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/gridcomm_kokkos.h b/src/KOKKOS/gridcomm_kokkos.h
new file mode 100644
index 0000000000..a8b3dfdc85
--- /dev/null
+++ b/src/KOKKOS/gridcomm_kokkos.h
@@ -0,0 +1,96 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_GRIDCOMM_KOKKOS_H
+#define LMP_GRIDCOMM_KOKKOS_H
+
+#include "pointers.h"
+#include "kokkos_type.h"
+
+#ifdef FFT_SINGLE
+typedef float FFT_SCALAR;
+#define MPI_FFT_SCALAR MPI_FLOAT
+#else
+typedef double FFT_SCALAR;
+#define MPI_FFT_SCALAR MPI_DOUBLE
+#endif
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class GridCommKokkos : protected Pointers {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+ 
+  GridCommKokkos(class LAMMPS *, MPI_Comm, int, int,
+           int, int, int, int, int, int,
+           int, int, int, int, int, int,
+           int, int, int, int, int, int);
+  GridCommKokkos(class LAMMPS *, MPI_Comm, int, int,
+           int, int, int, int, int, int,
+           int, int, int, int, int, int,
+           int, int, int, int, int, int,
+           int, int, int, int, int, int);
+  ~GridCommKokkos();
+  void ghost_notify();
+  int ghost_overlap();
+  void setup();
+  void forward_comm(class KSpace *, int);
+  void reverse_comm(class KSpace *, int);
+  double memory_usage();
+
+ private:
+  int me;
+  int nforward,nreverse;
+  MPI_Comm gridcomm;
+  MPI_Request request;
+
+  // in = inclusive indices of 3d grid chunk that I own
+  // out = inclusive indices of 3d grid chunk I own plus ghosts I use
+  // proc = 6 neighbor procs that surround me
+  // ghost = # of my owned grid planes needed from me
+  //         by each of 6 neighbor procs to become their ghost planes
+
+  int inxlo,inxhi,inylo,inyhi,inzlo,inzhi;
+  int outxlo,outxhi,outylo,outyhi,outzlo,outzhi;
+  int outxlo_max,outxhi_max,outylo_max,outyhi_max,outzlo_max,outzhi_max;
+  int procxlo,procxhi,procylo,procyhi,proczlo,proczhi;
+  int ghostxlo,ghostxhi,ghostylo,ghostyhi,ghostzlo,ghostzhi;
+
+  int nbuf;
+  //FFT_SCALAR *buf1,*buf2;
+  DAT::tdual_FFT_SCALAR_1d k_buf1;
+  DAT::tdual_FFT_SCALAR_1d k_buf2;
+
+  struct Swap {
+    int sendproc;       // proc to send to for forward comm
+    int recvproc;       // proc to recv from for forward comm
+    int npack;          // # of datums to pack
+    int nunpack;        // # of datums to unpack
+    //int *packlist;      // 3d array offsets to pack
+    //int *unpacklist;    // 3d array offsets to unpack
+  };
+
+  DAT::tdual_int_2d k_packlist;
+  DAT::tdual_int_2d k_unpacklist;
+
+  int nswap;
+  Swap *swap;
+
+  int indices(DAT::tdual_int_2d &, int, int, int, int, int, int, int);
+};
+
+}
+
+#endif
diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h
index 07a6e20f01..c1176122a7 100644
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@@ -14,6 +14,8 @@
 #ifndef LMP_LMPTYPE_KOKKOS_H
 #define LMP_LMPTYPE_KOKKOS_H
 
+#include "lmptype.h"
+
 #include <Kokkos_Core.hpp>
 #include <Kokkos_DualView.hpp>
 #include <impl/Kokkos_Timer.hpp>
@@ -24,6 +26,21 @@
 #define ISFINITE(x) std::isfinite(x)
 #endif
 
+// User-settable FFT precision
+
+// FFT_PRECISION = 1 is single-precision complex (4-byte real, 4-byte imag)
+// FFT_PRECISION = 2 is double-precision complex (8-byte real, 8-byte imag)
+
+#ifdef FFT_SINGLE
+#define FFT_PRECISION 1
+#define MPI_FFT_SCALAR MPI_FLOAT
+typedef float FFT_SCALAR;
+#else
+#define FFT_PRECISION 2
+#define MPI_FFT_SCALAR MPI_DOUBLE
+typedef double FFT_SCALAR;
+#endif
+
 #define MAX_TYPES_STACKPARAMS 12
 #define NeighClusterSize 8
 
@@ -567,6 +584,32 @@ typedef tdual_neighbors_2d::t_dev_um t_neighbors_2d_um;
 typedef tdual_neighbors_2d::t_dev_const_um t_neighbors_2d_const_um;
 typedef tdual_neighbors_2d::t_dev_const_randomread t_neighbors_2d_randomread;
 
+//Kspace
+
+typedef Kokkos::
+  DualView<FFT_SCALAR*, Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_SCALAR_1d;
+typedef tdual_FFT_SCALAR_1d::t_dev t_FFT_SCALAR_1d;
+typedef tdual_FFT_SCALAR_1d::t_dev_um t_FFT_SCALAR_1d_um;
+
+typedef Kokkos::DualView<FFT_SCALAR**,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d;
+typedef tdual_FFT_SCALAR_2d::t_dev t_FFT_SCALAR_2d;
+
+typedef Kokkos::DualView<FFT_SCALAR**[3],Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d_3;
+typedef tdual_FFT_SCALAR_2d_3::t_dev t_FFT_SCALAR_2d_3;
+
+typedef Kokkos::DualView<FFT_SCALAR***,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_3d;
+typedef tdual_FFT_SCALAR_3d::t_dev t_FFT_SCALAR_3d;
+
+typedef Kokkos::
+  DualView<FFT_SCALAR*[2], Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_DATA_1d;
+typedef tdual_FFT_DATA_1d::t_dev t_FFT_DATA_1d;
+typedef tdual_FFT_DATA_1d::t_dev_um t_FFT_DATA_1d_um;
+
+typedef Kokkos::
+  DualView<int*, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_64;
+typedef tdual_int_64::t_dev t_int_64;
+typedef tdual_int_64::t_dev_um t_int_64_um;
+
 };
 
 #ifdef KOKKOS_HAVE_CUDA
@@ -801,6 +844,33 @@ typedef tdual_neighbors_2d::t_host_um t_neighbors_2d_um;
 typedef tdual_neighbors_2d::t_host_const_um t_neighbors_2d_const_um;
 typedef tdual_neighbors_2d::t_host_const_randomread t_neighbors_2d_randomread;
 
+
+//Kspace
+
+typedef Kokkos::
+  DualView<FFT_SCALAR*, Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_SCALAR_1d;
+typedef tdual_FFT_SCALAR_1d::t_host t_FFT_SCALAR_1d;
+typedef tdual_FFT_SCALAR_1d::t_host_um t_FFT_SCALAR_1d_um;
+
+typedef Kokkos::DualView<FFT_SCALAR**,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d;
+typedef tdual_FFT_SCALAR_2d::t_host t_FFT_SCALAR_2d;
+
+typedef Kokkos::DualView<FFT_SCALAR**[3],Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_2d_3;
+typedef tdual_FFT_SCALAR_2d_3::t_host t_FFT_SCALAR_2d_3;
+
+typedef Kokkos::DualView<FFT_SCALAR***,Kokkos::LayoutRight,LMPDeviceType> tdual_FFT_SCALAR_3d;
+typedef tdual_FFT_SCALAR_3d::t_host t_FFT_SCALAR_3d;
+
+typedef Kokkos::
+  DualView<FFT_SCALAR*[2], Kokkos::LayoutRight, LMPDeviceType> tdual_FFT_DATA_1d;
+typedef tdual_FFT_DATA_1d::t_host t_FFT_DATA_1d;
+typedef tdual_FFT_DATA_1d::t_host_um t_FFT_DATA_1d_um;
+
+typedef Kokkos::
+  DualView<int*, LMPDeviceType::array_layout, LMPDeviceType> tdual_int_64;
+typedef tdual_int_64::t_host t_int_64;
+typedef tdual_int_64::t_host_um t_int_64_um;
+
 };
 #endif
 //default LAMMPS Types
diff --git a/src/KOKKOS/math_special_kokkos.cpp b/src/KOKKOS/math_special_kokkos.cpp
new file mode 100644
index 0000000000..a8c35b12b8
--- /dev/null
+++ b/src/KOKKOS/math_special_kokkos.cpp
@@ -0,0 +1,534 @@
+
+#include <math.h>
+#include <stdint.h>
+#include "math_special_kokkos.h"
+
+using namespace LAMMPS_NS;
+
+/* Library libcerf:
+ *   Compute complex error functions, based on a new implementation of
+ *   Faddeeva's w_of_z. Also provide Dawson and Voigt functions.
+ *
+ * File erfcx.c:
+ *   Compute erfcx(x) = exp(x^2) erfc(x) function, for real x,
+ *   using a novel algorithm that is much faster than DERFC of SLATEC.
+ *   This function is used in the computation of Faddeeva, Dawson, and
+ *   other complex error functions.
+ *
+ * Copyright:
+ *   (C) 2012 Massachusetts Institute of Technology
+ *   (C) 2013 Forschungszentrum Jülich GmbH
+ *
+ * Licence:
+ *   Permission is hereby granted, free of charge, to any person obtaining
+ *   a copy of this software and associated documentation files (the
+ *   "Software"), to deal in the Software without restriction, including
+ *   without limitation the rights to use, copy, modify, merge, publish,
+ *   distribute, sublicense, and/or sell copies of the Software, and to
+ *   permit persons to whom the Software is furnished to do so, subject to
+ *   the following conditions:
+ *
+ *   The above copyright notice and this permission notice shall be
+ *   included in all copies or substantial portions of the Software.
+ *
+ *   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ *   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ *   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ *   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ *   LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ *   OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ *   WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *   Steven G. Johnson, Massachusetts Institute of Technology, 2012, core author
+ *   Joachim Wuttke, Forschungszentrum Jülich, 2013, package maintainer
+ *
+ * Website:
+ *   http://apps.jcns.fz-juelich.de/libcerf
+ *
+ * Revision history:
+ *   ../CHANGELOG
+ *
+ * Manual page:
+ *   man 3 erfcx
+ */
+
+/******************************************************************************/
+/* Lookup-table for Chebyshev polynomials for smaller |x|                     */
+/******************************************************************************/
+
+double MathSpecialKokkos::erfcx_y100(const double y100)
+{
+    // Steven G. Johnson, October 2012.
+
+    // Given y100=100*y, where y = 4/(4+x) for x >= 0, compute erfc(x).
+
+    // Uses a look-up table of 100 different Chebyshev polynomials
+    // for y intervals [0,0.01], [0.01,0.02], ...., [0.99,1], generated
+    // with the help of Maple and a little shell script.  This allows
+    // the Chebyshev polynomials to be of significantly lower degree (about 1/4)
+    // compared to fitting the whole [0,1] interval with a single polynomial.
+
+    switch ((int) y100) {
+    case 0: {
+        double t = 2*y100 - 1;
+        return 0.70878032454106438663e-3 + (0.71234091047026302958e-3 + (0.35779077297597742384e-5 + (0.17403143962587937815e-7 + (0.81710660047307788845e-10 + (0.36885022360434957634e-12 + 0.15917038551111111111e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 1: {
+        double t = 2*y100 - 3;
+        return 0.21479143208285144230e-2 + (0.72686402367379996033e-3 + (0.36843175430938995552e-5 + (0.18071841272149201685e-7 + (0.85496449296040325555e-10 + (0.38852037518534291510e-12 + 0.16868473576888888889e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 2: {
+        double t = 2*y100 - 5;
+        return 0.36165255935630175090e-2 + (0.74182092323555510862e-3 + (0.37948319957528242260e-5 + (0.18771627021793087350e-7 + (0.89484715122415089123e-10 + (0.40935858517772440862e-12 + 0.17872061464888888889e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 3: {
+        double t = 2*y100 - 7;
+        return 0.51154983860031979264e-2 + (0.75722840734791660540e-3 + (0.39096425726735703941e-5 + (0.19504168704300468210e-7 + (0.93687503063178993915e-10 + (0.43143925959079664747e-12 + 0.18939926435555555556e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 4: {
+        double t = 2*y100 - 9;
+        return 0.66457513172673049824e-2 + (0.77310406054447454920e-3 + (0.40289510589399439385e-5 + (0.20271233238288381092e-7 + (0.98117631321709100264e-10 + (0.45484207406017752971e-12 + 0.20076352213333333333e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 5: {
+        double t = 2*y100 - 11;
+        return 0.82082389970241207883e-2 + (0.78946629611881710721e-3 + (0.41529701552622656574e-5 + (0.21074693344544655714e-7 + (0.10278874108587317989e-9 + (0.47965201390613339638e-12 + 0.21285907413333333333e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 6: {
+        double t = 2*y100 - 13;
+        return 0.98039537275352193165e-2 + (0.80633440108342840956e-3 + (0.42819241329736982942e-5 + (0.21916534346907168612e-7 + (0.10771535136565470914e-9 + (0.50595972623692822410e-12 + 0.22573462684444444444e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 7: {
+        double t = 2*y100 - 15;
+        return 0.11433927298290302370e-1 + (0.82372858383196561209e-3 + (0.44160495311765438816e-5 + (0.22798861426211986056e-7 + (0.11291291745879239736e-9 + (0.53386189365816880454e-12 + 0.23944209546666666667e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 8: {
+        double t = 2*y100 - 17;
+        return 0.13099232878814653979e-1 + (0.84167002467906968214e-3 + (0.45555958988457506002e-5 + (0.23723907357214175198e-7 + (0.11839789326602695603e-9 + (0.56346163067550237877e-12 + 0.25403679644444444444e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 9: {
+        double t = 2*y100 - 19;
+        return 0.14800987015587535621e-1 + (0.86018092946345943214e-3 + (0.47008265848816866105e-5 + (0.24694040760197315333e-7 + (0.12418779768752299093e-9 + (0.59486890370320261949e-12 + 0.26957764568888888889e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 10: {
+        double t = 2*y100 - 21;
+        return 0.16540351739394069380e-1 + (0.87928458641241463952e-3 + (0.48520195793001753903e-5 + (0.25711774900881709176e-7 + (0.13030128534230822419e-9 + (0.62820097586874779402e-12 + 0.28612737351111111111e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 11: {
+        double t = 2*y100 - 23;
+        return 0.18318536789842392647e-1 + (0.89900542647891721692e-3 + (0.50094684089553365810e-5 + (0.26779777074218070482e-7 + (0.13675822186304615566e-9 + (0.66358287745352705725e-12 + 0.30375273884444444444e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 12: {
+        double t = 2*y100 - 25;
+        return 0.20136801964214276775e-1 + (0.91936908737673676012e-3 + (0.51734830914104276820e-5 + (0.27900878609710432673e-7 + (0.14357976402809042257e-9 + (0.70114790311043728387e-12 + 0.32252476000000000000e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 13: {
+        double t = 2*y100 - 27;
+        return 0.21996459598282740954e-1 + (0.94040248155366777784e-3 + (0.53443911508041164739e-5 + (0.29078085538049374673e-7 + (0.15078844500329731137e-9 + (0.74103813647499204269e-12 + 0.34251892320000000000e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 14: {
+        double t = 2*y100 - 29;
+        return 0.23898877187226319502e-1 + (0.96213386835900177540e-3 + (0.55225386998049012752e-5 + (0.30314589961047687059e-7 + (0.15840826497296335264e-9 + (0.78340500472414454395e-12 + 0.36381553564444444445e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 15: {
+        double t = 2*y100 - 31;
+        return 0.25845480155298518485e-1 + (0.98459293067820123389e-3 + (0.57082915920051843672e-5 + (0.31613782169164830118e-7 + (0.16646478745529630813e-9 + (0.82840985928785407942e-12 + 0.38649975768888888890e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 16: {
+        double t = 2*y100 - 33;
+        return 0.27837754783474696598e-1 + (0.10078108563256892757e-2 + (0.59020366493792212221e-5 + (0.32979263553246520417e-7 + (0.17498524159268458073e-9 + (0.87622459124842525110e-12 + 0.41066206488888888890e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 17: {
+        double t = 2*y100 - 35;
+        return 0.29877251304899307550e-1 + (0.10318204245057349310e-2 + (0.61041829697162055093e-5 + (0.34414860359542720579e-7 + (0.18399863072934089607e-9 + (0.92703227366365046533e-12 + 0.43639844053333333334e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 18: {
+        double t = 2*y100 - 37;
+        return 0.31965587178596443475e-1 + (0.10566560976716574401e-2 + (0.63151633192414586770e-5 + (0.35924638339521924242e-7 + (0.19353584758781174038e-9 + (0.98102783859889264382e-12 + 0.46381060817777777779e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 19: {
+        double t = 2*y100 - 39;
+        return 0.34104450552588334840e-1 + (0.10823541191350532574e-2 + (0.65354356159553934436e-5 + (0.37512918348533521149e-7 + (0.20362979635817883229e-9 + (0.10384187833037282363e-11 + 0.49300625262222222221e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 20: {
+        double t = 2*y100 - 41;
+        return 0.36295603928292425716e-1 + (0.11089526167995268200e-2 + (0.67654845095518363577e-5 + (0.39184292949913591646e-7 + (0.21431552202133775150e-9 + (0.10994259106646731797e-11 + 0.52409949102222222221e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 21: {
+        double t = 2*y100 - 43;
+        return 0.38540888038840509795e-1 + (0.11364917134175420009e-2 + (0.70058230641246312003e-5 + (0.40943644083718586939e-7 + (0.22563034723692881631e-9 + (0.11642841011361992885e-11 + 0.55721092871111111110e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 22: {
+        double t = 2*y100 - 45;
+        return 0.40842225954785960651e-1 + (0.11650136437945673891e-2 + (0.72569945502343006619e-5 + (0.42796161861855042273e-7 + (0.23761401711005024162e-9 + (0.12332431172381557035e-11 + 0.59246802364444444445e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 23: {
+        double t = 2*y100 - 47;
+        return 0.43201627431540222422e-1 + (0.11945628793917272199e-2 + (0.75195743532849206263e-5 + (0.44747364553960993492e-7 + (0.25030885216472953674e-9 + (0.13065684400300476484e-11 + 0.63000532853333333334e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 24: {
+        double t = 2*y100 - 49;
+        return 0.45621193513810471438e-1 + (0.12251862608067529503e-2 + (0.77941720055551920319e-5 + (0.46803119830954460212e-7 + (0.26375990983978426273e-9 + (0.13845421370977119765e-11 + 0.66996477404444444445e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 25: {
+        double t = 2*y100 - 51;
+        return 0.48103121413299865517e-1 + (0.12569331386432195113e-2 + (0.80814333496367673980e-5 + (0.48969667335682018324e-7 + (0.27801515481905748484e-9 + (0.14674637611609884208e-11 + 0.71249589351111111110e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 26: {
+        double t = 2*y100 - 53;
+        return 0.50649709676983338501e-1 + (0.12898555233099055810e-2 + (0.83820428414568799654e-5 + (0.51253642652551838659e-7 + (0.29312563849675507232e-9 + (0.15556512782814827846e-11 + 0.75775607822222222221e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 27: {
+        double t = 2*y100 - 55;
+        return 0.53263363664388864181e-1 + (0.13240082443256975769e-2 + (0.86967260015007658418e-5 + (0.53662102750396795566e-7 + (0.30914568786634796807e-9 + (0.16494420240828493176e-11 + 0.80591079644444444445e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 28: {
+        double t = 2*y100 - 57;
+        return 0.55946601353500013794e-1 + (0.13594491197408190706e-2 + (0.90262520233016380987e-5 + (0.56202552975056695376e-7 + (0.32613310410503135996e-9 + (0.17491936862246367398e-11 + 0.85713381688888888890e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 29: {
+        double t = 2*y100 - 59;
+        return 0.58702059496154081813e-1 + (0.13962391363223647892e-2 + (0.93714365487312784270e-5 + (0.58882975670265286526e-7 + (0.34414937110591753387e-9 + (0.18552853109751857859e-11 + 0.91160736711111111110e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 30: {
+        double t = 2*y100 - 61;
+        return 0.61532500145144778048e-1 + (0.14344426411912015247e-2 + (0.97331446201016809696e-5 + (0.61711860507347175097e-7 + (0.36325987418295300221e-9 + (0.19681183310134518232e-11 + 0.96952238400000000000e-14 * t) * t) * t) * t) * t) * t;
+    }
+    case 31: {
+        double t = 2*y100 - 63;
+        return 0.64440817576653297993e-1 + (0.14741275456383131151e-2 + (0.10112293819576437838e-4 + (0.64698236605933246196e-7 + (0.38353412915303665586e-9 + (0.20881176114385120186e-11 + 0.10310784480000000000e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 32: {
+        double t = 2*y100 - 65;
+        return 0.67430045633130393282e-1 + (0.15153655418916540370e-2 + (0.10509857606888328667e-4 + (0.67851706529363332855e-7 + (0.40504602194811140006e-9 + (0.22157325110542534469e-11 + 0.10964842115555555556e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 33: {
+        double t = 2*y100 - 67;
+        return 0.70503365513338850709e-1 + (0.15582323336495709827e-2 + (0.10926868866865231089e-4 + (0.71182482239613507542e-7 + (0.42787405890153386710e-9 + (0.23514379522274416437e-11 + 0.11659571751111111111e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 34: {
+        double t = 2*y100 - 69;
+        return 0.73664114037944596353e-1 + (0.16028078812438820413e-2 + (0.11364423678778207991e-4 + (0.74701423097423182009e-7 + (0.45210162777476488324e-9 + (0.24957355004088569134e-11 + 0.12397238257777777778e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 35: {
+        double t = 2*y100 - 71;
+        return 0.76915792420819562379e-1 + (0.16491766623447889354e-2 + (0.11823685320041302169e-4 + (0.78420075993781544386e-7 + (0.47781726956916478925e-9 + (0.26491544403815724749e-11 + 0.13180196462222222222e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 36: {
+        double t = 2*y100 - 73;
+        return 0.80262075578094612819e-1 + (0.16974279491709504117e-2 + (0.12305888517309891674e-4 + (0.82350717698979042290e-7 + (0.50511496109857113929e-9 + (0.28122528497626897696e-11 + 0.14010889635555555556e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 37: {
+        double t = 2*y100 - 75;
+        return 0.83706822008980357446e-1 + (0.17476561032212656962e-2 + (0.12812343958540763368e-4 + (0.86506399515036435592e-7 + (0.53409440823869467453e-9 + (0.29856186620887555043e-11 + 0.14891851591111111111e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 38: {
+        double t = 2*y100 - 77;
+        return 0.87254084284461718231e-1 + (0.17999608886001962327e-2 + (0.13344443080089492218e-4 + (0.90900994316429008631e-7 + (0.56486134972616465316e-9 + (0.31698707080033956934e-11 + 0.15825697795555555556e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 39: {
+        double t = 2*y100 - 79;
+        return 0.90908120182172748487e-1 + (0.18544478050657699758e-2 + (0.13903663143426120077e-4 + (0.95549246062549906177e-7 + (0.59752787125242054315e-9 + (0.33656597366099099413e-11 + 0.16815130613333333333e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 40: {
+        double t = 2*y100 - 81;
+        return 0.94673404508075481121e-1 + (0.19112284419887303347e-2 + (0.14491572616545004930e-4 + (0.10046682186333613697e-6 + (0.63221272959791000515e-9 + (0.35736693975589130818e-11 + 0.17862931591111111111e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 41: {
+        double t = 2*y100 - 83;
+        return 0.98554641648004456555e-1 + (0.19704208544725622126e-2 + (0.15109836875625443935e-4 + (0.10567036667675984067e-6 + (0.66904168640019354565e-9 + (0.37946171850824333014e-11 + 0.18971959040000000000e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 42: {
+        double t = 2*y100 - 85;
+        return 0.10255677889470089531e0 + (0.20321499629472857418e-2 + (0.15760224242962179564e-4 + (0.11117756071353507391e-6 + (0.70814785110097658502e-9 + (0.40292553276632563925e-11 + 0.20145143075555555556e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 43: {
+        double t = 2*y100 - 87;
+        return 0.10668502059865093318e0 + (0.20965479776148731610e-2 + (0.16444612377624983565e-4 + (0.11700717962026152749e-6 + (0.74967203250938418991e-9 + (0.42783716186085922176e-11 + 0.21385479360000000000e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 44: {
+        double t = 2*y100 - 89;
+        return 0.11094484319386444474e0 + (0.21637548491908170841e-2 + (0.17164995035719657111e-4 + (0.12317915750735938089e-6 + (0.79376309831499633734e-9 + (0.45427901763106353914e-11 + 0.22696025653333333333e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 45: {
+        double t = 2*y100 - 91;
+        return 0.11534201115268804714e0 + (0.22339187474546420375e-2 + (0.17923489217504226813e-4 + (0.12971465288245997681e-6 + (0.84057834180389073587e-9 + (0.48233721206418027227e-11 + 0.24079890062222222222e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 46: {
+        double t = 2*y100 - 93;
+        return 0.11988259392684094740e0 + (0.23071965691918689601e-2 + (0.18722342718958935446e-4 + (0.13663611754337957520e-6 + (0.89028385488493287005e-9 + (0.51210161569225846701e-11 + 0.25540227111111111111e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 47: {
+        double t = 2*y100 - 95;
+        return 0.12457298393509812907e0 + (0.23837544771809575380e-2 + (0.19563942105711612475e-4 + (0.14396736847739470782e-6 + (0.94305490646459247016e-9 + (0.54366590583134218096e-11 + 0.27080225920000000000e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 48: {
+        double t = 2*y100 - 97;
+        return 0.12941991566142438816e0 + (0.24637684719508859484e-2 + (0.20450821127475879816e-4 + (0.15173366280523906622e-6 + (0.99907632506389027739e-9 + (0.57712760311351625221e-11 + 0.28703099555555555556e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 49: {
+        double t = 2*y100 - 99;
+        return 0.13443048593088696613e0 + (0.25474249981080823877e-2 + (0.21385669591362915223e-4 + (0.15996177579900443030e-6 + (0.10585428844575134013e-8 + (0.61258809536787882989e-11 + 0.30412080142222222222e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 50: {
+        double t = 2*y100 - 101;
+        return 0.13961217543434561353e0 + (0.26349215871051761416e-2 + (0.22371342712572567744e-4 + (0.16868008199296822247e-6 + (0.11216596910444996246e-8 + (0.65015264753090890662e-11 + 0.32210394506666666666e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 51: {
+        double t = 2*y100 - 103;
+        return 0.14497287157673800690e0 + (0.27264675383982439814e-2 + (0.23410870961050950197e-4 + (0.17791863939526376477e-6 + (0.11886425714330958106e-8 + (0.68993039665054288034e-11 + 0.34101266222222222221e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 52: {
+        double t = 2*y100 - 105;
+        return 0.15052089272774618151e0 + (0.28222846410136238008e-2 + (0.24507470422713397006e-4 + (0.18770927679626136909e-6 + (0.12597184587583370712e-8 + (0.73203433049229821618e-11 + 0.36087889048888888890e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 53: {
+        double t = 2*y100 - 107;
+        return 0.15626501395774612325e0 + (0.29226079376196624949e-2 + (0.25664553693768450545e-4 + (0.19808568415654461964e-6 + (0.13351257759815557897e-8 + (0.77658124891046760667e-11 + 0.38173420035555555555e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 54: {
+        double t = 2*y100 - 109;
+        return 0.16221449434620737567e0 + (0.30276865332726475672e-2 + (0.26885741326534564336e-4 + (0.20908350604346384143e-6 + (0.14151148144240728728e-8 + (0.82369170665974313027e-11 + 0.40360957457777777779e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 55: {
+        double t = 2*y100 - 111;
+        return 0.16837910595412130659e0 + (0.31377844510793082301e-2 + (0.28174873844911175026e-4 + (0.22074043807045782387e-6 + (0.14999481055996090039e-8 + (0.87348993661930809254e-11 + 0.42653528977777777779e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 56: {
+        double t = 2*y100 - 113;
+        return 0.17476916455659369953e0 + (0.32531815370903068316e-2 + (0.29536024347344364074e-4 + (0.23309632627767074202e-6 + (0.15899007843582444846e-8 + (0.92610375235427359475e-11 + 0.45054073102222222221e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 57: {
+        double t = 2*y100 - 115;
+        return 0.18139556223643701364e0 + (0.33741744168096996041e-2 + (0.30973511714709500836e-4 + (0.24619326937592290996e-6 + (0.16852609412267750744e-8 + (0.98166442942854895573e-11 + 0.47565418097777777779e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 58: {
+        double t = 2*y100 - 117;
+        return 0.18826980194443664549e0 + (0.35010775057740317997e-2 + (0.32491914440014267480e-4 + (0.26007572375886319028e-6 + (0.17863299617388376116e-8 + (0.10403065638343878679e-10 + 0.50190265831111111110e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 59: {
+        double t = 2*y100 - 119;
+        return 0.19540403413693967350e0 + (0.36342240767211326315e-2 + (0.34096085096200907289e-4 + (0.27479061117017637474e-6 + (0.18934228504790032826e-8 + (0.11021679075323598664e-10 + 0.52931171733333333334e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 60: {
+        double t = 2*y100 - 121;
+        return 0.20281109560651886959e0 + (0.37739673859323597060e-2 + (0.35791165457592409054e-4 + (0.29038742889416172404e-6 + (0.20068685374849001770e-8 + (0.11673891799578381999e-10 + 0.55790523093333333334e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 61: {
+        double t = 2*y100 - 123;
+        return 0.21050455062669334978e0 + (0.39206818613925652425e-2 + (0.37582602289680101704e-4 + (0.30691836231886877385e-6 + (0.21270101645763677824e-8 + (0.12361138551062899455e-10 + 0.58770520160000000000e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 62: {
+        double t = 2*y100 - 125;
+        return 0.21849873453703332479e0 + (0.40747643554689586041e-2 + (0.39476163820986711501e-4 + (0.32443839970139918836e-6 + (0.22542053491518680200e-8 + (0.13084879235290858490e-10 + 0.61873153262222222221e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 63: {
+        double t = 2*y100 - 127;
+        return 0.22680879990043229327e0 + (0.42366354648628516935e-2 + (0.41477956909656896779e-4 + (0.34300544894502810002e-6 + (0.23888264229264067658e-8 + (0.13846596292818514601e-10 + 0.65100183751111111110e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 64: {
+        double t = 2*y100 - 129;
+        return 0.23545076536988703937e0 + (0.44067409206365170888e-2 + (0.43594444916224700881e-4 + (0.36268045617760415178e-6 + (0.25312606430853202748e-8 + (0.14647791812837903061e-10 + 0.68453122631111111110e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 65: {
+        double t = 2*y100 - 131;
+        return 0.24444156740777432838e0 + (0.45855530511605787178e-2 + (0.45832466292683085475e-4 + (0.38352752590033030472e-6 + (0.26819103733055603460e-8 + (0.15489984390884756993e-10 + 0.71933206364444444445e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 66: {
+        double t = 2*y100 - 133;
+        return 0.25379911500634264643e0 + (0.47735723208650032167e-2 + (0.48199253896534185372e-4 + (0.40561404245564732314e-6 + (0.28411932320871165585e-8 + (0.16374705736458320149e-10 + 0.75541379822222222221e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 67: {
+        double t = 2*y100 - 135;
+        return 0.26354234756393613032e0 + (0.49713289477083781266e-2 + (0.50702455036930367504e-4 + (0.42901079254268185722e-6 + (0.30095422058900481753e-8 + (0.17303497025347342498e-10 + 0.79278273368888888890e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 68: {
+        double t = 2*y100 - 137;
+        return 0.27369129607732343398e0 + (0.51793846023052643767e-2 + (0.53350152258326602629e-4 + (0.45379208848865015485e-6 + (0.31874057245814381257e-8 + (0.18277905010245111046e-10 + 0.83144182364444444445e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 69: {
+        double t = 2*y100 - 139;
+        return 0.28426714781640316172e0 + (0.53983341916695141966e-2 + (0.56150884865255810638e-4 + (0.48003589196494734238e-6 + (0.33752476967570796349e-8 + (0.19299477888083469086e-10 + 0.87139049137777777779e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 70: {
+        double t = 2*y100 - 141;
+        return 0.29529231465348519920e0 + (0.56288077305420795663e-2 + (0.59113671189913307427e-4 + (0.50782393781744840482e-6 + (0.35735475025851713168e-8 + (0.20369760937017070382e-10 + 0.91262442613333333334e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 71: {
+        double t = 2*y100 - 143;
+        return 0.30679050522528838613e0 + (0.58714723032745403331e-2 + (0.62248031602197686791e-4 + (0.53724185766200945789e-6 + (0.37827999418960232678e-8 + (0.21490291930444538307e-10 + 0.95513539182222222221e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 72: {
+        double t = 2*y100 - 145;
+        return 0.31878680111173319425e0 + (0.61270341192339103514e-2 + (0.65564012259707640976e-4 + (0.56837930287837738996e-6 + (0.40035151353392378882e-8 + (0.22662596341239294792e-10 + 0.99891109760000000000e-13 * t) * t) * t) * t) * t) * t;
+    }
+    case 73: {
+        double t = 2*y100 - 147;
+        return 0.33130773722152622027e0 + (0.63962406646798080903e-2 + (0.69072209592942396666e-4 + (0.60133006661885941812e-6 + (0.42362183765883466691e-8 + (0.23888182347073698382e-10 + 0.10439349811555555556e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 74: {
+        double t = 2*y100 - 149;
+        return 0.34438138658041336523e0 + (0.66798829540414007258e-2 + (0.72783795518603561144e-4 + (0.63619220443228800680e-6 + (0.44814499336514453364e-8 + (0.25168535651285475274e-10 + 0.10901861383111111111e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 75: {
+        double t = 2*y100 - 151;
+        return 0.35803744972380175583e0 + (0.69787978834882685031e-2 + (0.76710543371454822497e-4 + (0.67306815308917386747e-6 + (0.47397647975845228205e-8 + (0.26505114141143050509e-10 + 0.11376390933333333333e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 76: {
+        double t = 2*y100 - 153;
+        return 0.37230734890119724188e0 + (0.72938706896461381003e-2 + (0.80864854542670714092e-4 + (0.71206484718062688779e-6 + (0.50117323769745883805e-8 + (0.27899342394100074165e-10 + 0.11862637614222222222e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 77: {
+        double t = 2*y100 - 155;
+        return 0.38722432730555448223e0 + (0.76260375162549802745e-2 + (0.85259785810004603848e-4 + (0.75329383305171327677e-6 + (0.52979361368388119355e-8 + (0.29352606054164086709e-10 + 0.12360253370666666667e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 78: {
+        double t = 2*y100 - 157;
+        return 0.40282355354616940667e0 + (0.79762880915029728079e-2 + (0.89909077342438246452e-4 + (0.79687137961956194579e-6 + (0.55989731807360403195e-8 + (0.30866246101464869050e-10 + 0.12868841946666666667e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 79: {
+        double t = 2*y100 - 159;
+        return 0.41914223158913787649e0 + (0.83456685186950463538e-2 + (0.94827181359250161335e-4 + (0.84291858561783141014e-6 + (0.59154537751083485684e-8 + (0.32441553034347469291e-10 + 0.13387957943111111111e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 80: {
+        double t = 2*y100 - 161;
+        return 0.43621971639463786896e0 + (0.87352841828289495773e-2 + (0.10002929142066799966e-3 + (0.89156148280219880024e-6 + (0.62480008150788597147e-8 + (0.34079760983458878910e-10 + 0.13917107176888888889e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 81: {
+        double t = 2*y100 - 163;
+        return 0.45409763548534330981e0 + (0.91463027755548240654e-2 + (0.10553137232446167258e-3 + (0.94293113464638623798e-6 + (0.65972492312219959885e-8 + (0.35782041795476563662e-10 + 0.14455745872000000000e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 82: {
+        double t = 2*y100 - 165;
+        return 0.47282001668512331468e0 + (0.95799574408860463394e-2 + (0.11135019058000067469e-3 + (0.99716373005509038080e-6 + (0.69638453369956970347e-8 + (0.37549499088161345850e-10 + 0.15003280712888888889e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 83: {
+        double t = 2*y100 - 167;
+        return 0.49243342227179841649e0 + (0.10037550043909497071e-1 + (0.11750334542845234952e-3 + (0.10544006716188967172e-5 + (0.73484461168242224872e-8 + (0.39383162326435752965e-10 + 0.15559069118222222222e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 84: {
+        double t = 2*y100 - 169;
+        return 0.51298708979209258326e0 + (0.10520454564612427224e-1 + (0.12400930037494996655e-3 + (0.11147886579371265246e-5 + (0.77517184550568711454e-8 + (0.41283980931872622611e-10 + 0.16122419680000000000e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 85: {
+        double t = 2*y100 - 171;
+        return 0.53453307979101369843e0 + (0.11030120618800726938e-1 + (0.13088741519572269581e-3 + (0.11784797595374515432e-5 + (0.81743383063044825400e-8 + (0.43252818449517081051e-10 + 0.16692592640000000000e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 86: {
+        double t = 2*y100 - 173;
+        return 0.55712643071169299478e0 + (0.11568077107929735233e-1 + (0.13815797838036651289e-3 + (0.12456314879260904558e-5 + (0.86169898078969313597e-8 + (0.45290446811539652525e-10 + 0.17268801084444444444e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 87: {
+        double t = 2*y100 - 175;
+        return 0.58082532122519320968e0 + (0.12135935999503877077e-1 + (0.14584223996665838559e-3 + (0.13164068573095710742e-5 + (0.90803643355106020163e-8 + (0.47397540713124619155e-10 + 0.17850211608888888889e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 88: {
+        double t = 2*y100 - 177;
+        return 0.60569124025293375554e0 + (0.12735396239525550361e-1 + (0.15396244472258863344e-3 + (0.13909744385382818253e-5 + (0.95651595032306228245e-8 + (0.49574672127669041550e-10 + 0.18435945564444444444e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 89: {
+        double t = 2*y100 - 179;
+        return 0.63178916494715716894e0 + (0.13368247798287030927e-1 + (0.16254186562762076141e-3 + (0.14695084048334056083e-5 + (0.10072078109604152350e-7 + (0.51822304995680707483e-10 + 0.19025081422222222222e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 90: {
+        double t = 2*y100 - 181;
+        return 0.65918774689725319200e0 + (0.14036375850601992063e-1 + (0.17160483760259706354e-3 + (0.15521885688723188371e-5 + (0.10601827031535280590e-7 + (0.54140790105837520499e-10 + 0.19616655146666666667e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 91: {
+        double t = 2*y100 - 183;
+        return 0.68795950683174433822e0 + (0.14741765091365869084e-1 + (0.18117679143520433835e-3 + (0.16392004108230585213e-5 + (0.11155116068018043001e-7 + (0.56530360194925690374e-10 + 0.20209663662222222222e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 92: {
+        double t = 2*y100 - 185;
+        return 0.71818103808729967036e0 + (0.15486504187117112279e-1 + (0.19128428784550923217e-3 + (0.17307350969359975848e-5 + (0.11732656736113607751e-7 + (0.58991125287563833603e-10 + 0.20803065333333333333e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 93: {
+        double t = 2*y100 - 187;
+        return 0.74993321911726254661e0 + (0.16272790364044783382e-1 + (0.20195505163377912645e-3 + (0.18269894883203346953e-5 + (0.12335161021630225535e-7 + (0.61523068312169087227e-10 + 0.21395783431111111111e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 94: {
+        double t = 2*y100 - 189;
+        return 0.78330143531283492729e0 + (0.17102934132652429240e-1 + (0.21321800585063327041e-3 + (0.19281661395543913713e-5 + (0.12963340087354341574e-7 + (0.64126040998066348872e-10 + 0.21986708942222222222e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 95: {
+        double t = 2*y100 - 191;
+        return 0.81837581041023811832e0 + (0.17979364149044223802e-1 + (0.22510330592753129006e-3 + (0.20344732868018175389e-5 + (0.13617902941839949718e-7 + (0.66799760083972474642e-10 + 0.22574701262222222222e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 96: {
+        double t = 2*y100 - 193;
+        return 0.85525144775685126237e0 + (0.18904632212547561026e-1 + (0.23764237370371255638e-3 + (0.21461248251306387979e-5 + (0.14299555071870523786e-7 + (0.69543803864694171934e-10 + 0.23158593688888888889e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 97: {
+        double t = 2*y100 - 195;
+        return 0.89402868170849933734e0 + (0.19881418399127202569e-1 + (0.25086793128395995798e-3 + (0.22633402747585233180e-5 + (0.15008997042116532283e-7 + (0.72357609075043941261e-10 + 0.23737194737777777778e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 98: {
+        double t = 2*y100 - 197;
+        return 0.93481333942870796363e0 + (0.20912536329780368893e-1 + (0.26481403465998477969e-3 + (0.23863447359754921676e-5 + (0.15746923065472184451e-7 + (0.75240468141720143653e-10 + 0.24309291271111111111e-12 * t) * t) * t) * t) * t) * t;
+    }
+    case 99: {
+        double t = 2*y100 - 199;
+        return 0.97771701335885035464e0 + (0.22000938572830479551e-1 + (0.27951610702682383001e-3 + (0.25153688325245314530e-5 + (0.16514019547822821453e-7 + (0.78191526829368231251e-10 + 0.24873652355555555556e-12 * t) * t) * t) * t) * t) * t;
+    }
+    }
+    /* we only get here if y = 1, i.e. |x| < 4*eps, in which case
+     * erfcx is within 1e-15 of 1.. */
+    return 1.0;
+} /* erfcx_y100 */
+
+/* optimizer friendly implementation of exp2(x).
+ *
+ * strategy:
+ *
+ * split argument into an integer part and a fraction:
+ * ipart = floor(x+0.5);
+ * fpart = x - ipart;
+ *
+ * compute exp2(ipart) from setting the ieee754 exponent
+ * compute exp2(fpart) using a pade' approximation for x in [-0.5;0.5[
+ *
+ * the result becomes: exp2(x) = exp2(ipart) * exp2(fpart)
+ */
+
+/* IEEE 754 double precision floating point data manipulation */
+typedef union
+{
+    double   f;
+    uint64_t u;
+    struct {int32_t  i0,i1;} s;
+}  udi_t;
+
+static const double fm_exp2_q[] = {
+/*  1.00000000000000000000e0, */
+    2.33184211722314911771e2,
+    4.36821166879210612817e3
+};
+static const double fm_exp2_p[] = {
+    2.30933477057345225087e-2,
+    2.02020656693165307700e1,
+    1.51390680115615096133e3
+};
+
+double MathSpecialKokkos::exp2_x86(double x)
+{
+    double   ipart, fpart, px, qx;
+    udi_t    epart;
+
+    ipart = floor(x+0.5);
+    fpart = x - ipart;
+    epart.s.i0 = 0;
+    epart.s.i1 = (((int) ipart) + 1023) << 20;
+
+    x = fpart*fpart;
+
+    px =        fm_exp2_p[0];
+    px = px*x + fm_exp2_p[1];
+    qx =    x + fm_exp2_q[0];
+    px = px*x + fm_exp2_p[2];
+    qx = qx*x + fm_exp2_q[1];
+
+    px = px * fpart;
+
+    x = 1.0 + 2.0*(px/(qx-px));
+    return epart.f*x;
+}
diff --git a/src/KOKKOS/math_special_kokkos.h b/src/KOKKOS/math_special_kokkos.h
new file mode 100644
index 0000000000..c177e88574
--- /dev/null
+++ b/src/KOKKOS/math_special_kokkos.h
@@ -0,0 +1,100 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_MATH_SPECIAL_KOKKOS_H
+#define LMP_MATH_SPECIAL_KOKKOS_H
+
+#include <math.h>
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+namespace MathSpecialKokkos {
+
+  // support function for scaled error function complement
+
+  extern double erfcx_y100(const double y100);
+
+  // fast 2**x function without argument checks for little endian CPUs
+  extern double exp2_x86(double x);
+
+  // scaled error function complement exp(x*x)*erfc(x) for coul/long styles
+
+  static inline double my_erfcx(const double x)
+  {
+    if (x >= 0.0) return erfcx_y100(400.0/(4.0+x));
+    else return 2.0*exp(x*x) - erfcx_y100(400.0/(4.0-x));
+  }
+
+  // exp(-x*x) for coul/long styles
+  
+  static inline double expmsq(double x)
+  {
+    x *= x;
+    x *= 1.4426950408889634074; // log_2(e)
+#if defined(__BYTE_ORDER__)
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    return (x < 1023.0) ? exp2_x86(-x) : 0.0;
+#endif
+#endif
+    return (x < 1023.0) ? exp2(-x) : 0.0;
+  }
+
+  // x**2, use instead of pow(x,2.0)
+  KOKKOS_INLINE_FUNCTION
+  static double square(const double &x) { return x*x; }
+
+  // x**3, use instead of pow(x,3.0)
+  KOKKOS_INLINE_FUNCTION
+  static double cube(const double &x) { return x*x*x; }
+
+  // return -1.0 for odd n, 1.0 for even n, like pow(-1.0,n)
+  KOKKOS_INLINE_FUNCTION
+  static double powsign(const int n) { return (n & 1) ? -1.0 : 1.0; }
+
+  // optimized version of pow(x,n) with n being integer
+  // up to 10x faster than pow(x,y)
+
+  KOKKOS_INLINE_FUNCTION
+  static double powint(const double &x, const int n) {
+    double yy,ww;
+
+    if (x == 0.0) return 0.0;
+    int nn = (n > 0) ? n : -n;
+    ww = x;
+
+    for (yy = 1.0; nn != 0; nn >>= 1, ww *=ww)
+      if (nn & 1) yy *= ww;
+
+    return (n > 0) ? yy : 1.0/yy;
+  }
+
+  // optimized version of (sin(x)/x)**n with n being a _positive_ integer
+
+  KOKKOS_INLINE_FUNCTION
+  static double powsinxx(const double &x, int n) {
+    double yy,ww;
+
+    if (x == 0.0) return 1.0;
+
+    ww = sin(x)/x;
+
+    for (yy = 1.0; n != 0; n >>= 1, ww *=ww)
+      if (n & 1) yy *= ww;
+
+    return yy;
+  }
+}
+}
+
+#endif
diff --git a/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.cpp b/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.cpp
index c749b85f3c..3b2b13f40b 100644
--- a/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.cpp
+++ b/src/KOKKOS/pair_lj_charmm_coul_long_kokkos.cpp
@@ -115,6 +115,19 @@ void PairLJCharmmCoulLongKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    d_vatom = k_vatom.view<DeviceType>();
+  }
+
   atomKK->sync(execution_space,datamask_read);
   k_cutsq.template sync<DeviceType>();
   k_cut_ljsq.template sync<DeviceType>();
@@ -169,6 +182,16 @@ void PairLJCharmmCoulLongKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   if (vflag_fdotr) pair_virial_fdotr_compute(this);
 
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+
   copymode = 0;
 }
 
diff --git a/src/KOKKOS/pair_lj_cut_coul_long_kokkos.cpp b/src/KOKKOS/pair_lj_cut_coul_long_kokkos.cpp
index 00d1561bc3..2a1a124460 100644
--- a/src/KOKKOS/pair_lj_cut_coul_long_kokkos.cpp
+++ b/src/KOKKOS/pair_lj_cut_coul_long_kokkos.cpp
@@ -68,6 +68,10 @@ PairLJCutCoulLongKokkos<DeviceType>::PairLJCutCoulLongKokkos(LAMMPS *lmp):PairLJ
 template<class DeviceType>
 PairLJCutCoulLongKokkos<DeviceType>::~PairLJCutCoulLongKokkos()
 {
+  memory->destroy_kokkos(k_eatom,eatom);
+  memory->destroy_kokkos(k_vatom,vatom);
+  eatom = NULL;
+  vatom = NULL;
   if (allocated){
     memory->destroy_kokkos(k_cutsq, cutsq);
     memory->destroy_kokkos(k_cut_ljsq, cut_ljsq);
@@ -100,6 +104,20 @@ void PairLJCutCoulLongKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   if (eflag || vflag) ev_setup(eflag,vflag);
   else evflag = vflag_fdotr = 0;
 
+
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    d_vatom = k_vatom.view<DeviceType>();
+  }
+
   atomKK->sync(execution_space,datamask_read);
   k_cutsq.template sync<DeviceType>();
   k_cut_ljsq.template sync<DeviceType>();
@@ -151,6 +169,16 @@ void PairLJCutCoulLongKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   }
 
   if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KOKKOS/pair_lj_cut_coul_long_kokkos.h b/src/KOKKOS/pair_lj_cut_coul_long_kokkos.h
index 9542853839..ed35c6d471 100644
--- a/src/KOKKOS/pair_lj_cut_coul_long_kokkos.h
+++ b/src/KOKKOS/pair_lj_cut_coul_long_kokkos.h
@@ -87,6 +87,9 @@ class PairLJCutCoulLongKokkos : public PairLJCutCoulLong {
   typename ArrayTypes<DeviceType>::t_f_array f;
   typename ArrayTypes<DeviceType>::t_int_1d_randomread type;
   typename ArrayTypes<DeviceType>::t_float_1d_randomread q;
+  
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
   typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
   typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
 
diff --git a/src/KOKKOS/pair_reax_c_kokkos.cpp b/src/KOKKOS/pair_reax_c_kokkos.cpp
index eba6fbb0b5..ea898843e7 100644
--- a/src/KOKKOS/pair_reax_c_kokkos.cpp
+++ b/src/KOKKOS/pair_reax_c_kokkos.cpp
@@ -660,6 +660,8 @@ void PairReaxCKokkos<DeviceType>::LR_vdW_Coulomb( int i, int j, double r_ij, LR_
 template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 {
+  copymode = 1;
+
   bocnt = hbcnt = 0;
 
   eflag = eflag_in;
@@ -703,8 +705,6 @@ void PairReaxCKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
       pvector[i] = 0.0;
   }
 
-  copymode = 1;
-
   EV_FLOAT_REAX ev;
   EV_FLOAT_REAX ev_all;
 
@@ -1363,6 +1363,23 @@ void PairReaxCKokkos<DeviceType>::operator()(PairReaxZero, const int &n) const {
     d_dDeltap_self(n,j) = 0.0;
 }
 
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairReaxCKokkos<DeviceType>::operator()(PairReaxZeroEAtom, const int &i) const {
+  v_eatom(i) = 0.0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairReaxCKokkos<DeviceType>::operator()(PairReaxZeroVAtom, const int &i) const {
+  v_vatom(i,0) = 0.0;
+  v_vatom(i,1) = 0.0;
+  v_vatom(i,2) = 0.0;
+  v_vatom(i,3) = 0.0;
+  v_vatom(i,4) = 0.0;
+  v_vatom(i,5) = 0.0;
+}
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
@@ -2398,7 +2415,7 @@ void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeAngular<NEIGHFLAG,EV
   F_FLOAT BOA_ij, BOA_ik, rij, bo_ij, bo_ik;
   F_FLOAT dcos_theta_di[3], dcos_theta_dj[3], dcos_theta_dk[3];
   F_FLOAT eng_tmp, fi_tmp[3], fj_tmp[3], fk_tmp[3];
-  F_FLOAT delij[3], delik[3];
+  F_FLOAT delij[3], delik[3], delji[3], delki[3];
 
   p_val6 = gp[14];
   p_val8 = gp[33];
@@ -2648,8 +2665,10 @@ void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeAngular<NEIGHFLAG,EV
       if (EVFLAG) {
         eng_tmp = e_ang + e_pen + e_coa;
         //if (eflag_atom) this->template ev_tally<NEIGHFLAG>(ev,i,j,eng_tmp,0.0,0.0,0.0,0.0);
+        for (int d = 0; d < 3; d++) delki[d] = -1.0 * delik[d];
+        for (int d = 0; d < 3; d++) delji[d] = -1.0 * delij[d];
         if (eflag_atom) this->template e_tally<NEIGHFLAG>(ev,i,j,eng_tmp);
-        if (vflag_either) this->template v_tally3<NEIGHFLAG>(ev,i,j,k,fj_tmp,fk_tmp,delij,delik);
+        if (vflag_either) this->template v_tally3<NEIGHFLAG>(ev,i,j,k,fj_tmp,fk_tmp,delji,delki);
       }
 
     }
@@ -2884,31 +2903,34 @@ void PairReaxCKokkos<DeviceType>::operator()(PairReaxComputeTorsion<NEIGHFLAG,EV
         if( arg >  1.0 ) arg =  1.0;
         if( arg < -1.0 ) arg = -1.0;
 
-        if( sin_ijk >= 0 && sin_ijk <= 1e-10 ) sin_ijk = 1e-10;
-        else if( sin_ijk <= 0 && sin_ijk >= -1e-10 ) sin_ijk = -1e-10;
-        if( sin_jil >= 0 && sin_jil <= 1e-10 ) sin_jil = 1e-10;
-        else if( sin_jil <= 0 && sin_jil >= -1e-10 ) sin_jil = -1e-10;
+        F_FLOAT sin_ijk_rnd = sin_ijk;
+        F_FLOAT sin_jil_rnd = sin_jil;
+
+        if( sin_ijk >= 0 && sin_ijk <= 1e-10 ) sin_ijk_rnd = 1e-10;
+        else if( sin_ijk <= 0 && sin_ijk >= -1e-10 ) sin_ijk_rnd = -1e-10;
+        if( sin_jil >= 0 && sin_jil <= 1e-10 ) sin_jil_rnd = 1e-10;
+        else if( sin_jil <= 0 && sin_jil >= -1e-10 ) sin_jil_rnd = -1e-10;
 
         // dcos_omega_di
         for (int d = 0; d < 3; d++) dcos_omega_dk[d] = ((htra-arg*hnra)/rik) * delik[d] - dellk[d];
-        for (int d = 0; d < 3; d++) dcos_omega_dk[d] += (hthd-arg*hnhd)/sin_ijk * -dcos_ijk_dk[d];
+        for (int d = 0; d < 3; d++) dcos_omega_dk[d] += (hthd-arg*hnhd)/sin_ijk_rnd * -dcos_ijk_dk[d];
         for (int d = 0; d < 3; d++) dcos_omega_dk[d] *= 2.0/poem;
 
         // dcos_omega_dj
         for (int d = 0; d < 3; d++) dcos_omega_di[d] = -((htra-arg*hnra)/rik) * delik[d] - htrb/rij * delij[d];
-        for (int d = 0; d < 3; d++) dcos_omega_di[d] += -(hthd-arg*hnhd)/sin_ijk * dcos_ijk_di[d];
-        for (int d = 0; d < 3; d++) dcos_omega_di[d] += -(hthe-arg*hnhe)/sin_jil * dcos_jil_di[d];
+        for (int d = 0; d < 3; d++) dcos_omega_di[d] += -(hthd-arg*hnhd)/sin_ijk_rnd * dcos_ijk_di[d];
+        for (int d = 0; d < 3; d++) dcos_omega_di[d] += -(hthe-arg*hnhe)/sin_jil_rnd * dcos_jil_di[d];
         for (int d = 0; d < 3; d++) dcos_omega_di[d] *= 2.0/poem;
 
         // dcos_omega_dk
         for (int d = 0; d < 3; d++) dcos_omega_dj[d] = -((htrc-arg*hnrc)/rjl) * deljl[d] + htrb/rij * delij[d];
-        for (int d = 0; d < 3; d++) dcos_omega_dj[d] += -(hthd-arg*hnhd)/sin_ijk * dcos_ijk_dj[d];
-        for (int d = 0; d < 3; d++) dcos_omega_dj[d] += -(hthe-arg*hnhe)/sin_jil * dcos_jil_dj[d];
+        for (int d = 0; d < 3; d++) dcos_omega_dj[d] += -(hthd-arg*hnhd)/sin_ijk_rnd * dcos_ijk_dj[d];
+        for (int d = 0; d < 3; d++) dcos_omega_dj[d] += -(hthe-arg*hnhe)/sin_jil_rnd * dcos_jil_dj[d];
         for (int d = 0; d < 3; d++) dcos_omega_dj[d] *= 2.0/poem;
 
         // dcos_omega_dl
         for (int d = 0; d < 3; d++) dcos_omega_dl[d] = ((htrc-arg*hnrc)/rjl) * deljl[d] + dellk[d];
-        for (int d = 0; d < 3; d++) dcos_omega_dl[d] += (hthe-arg*hnhe)/sin_jil * -dcos_jil_dk[d];
+        for (int d = 0; d < 3; d++) dcos_omega_dl[d] += (hthe-arg*hnhe)/sin_jil_rnd * -dcos_jil_dk[d];
         for (int d = 0; d < 3; d++) dcos_omega_dl[d] *= 2.0/poem;
 
         cos_omega = cos( omega );
@@ -3728,12 +3750,12 @@ void PairReaxCKokkos<DeviceType>::v_tally3(EV_FLOAT_REAX &ev, const int &i, cons
 
   F_FLOAT v[6];
 
-  v[0] = (drij[0]*fj[0] + drik[0]*fk[0]);
-  v[1] = (drij[1]*fj[1] + drik[1]*fk[1]);
-  v[2] = (drij[2]*fj[2] + drik[2]*fk[2]);
-  v[3] = (drij[0]*fj[1] + drik[0]*fk[1]);
-  v[4] = (drij[0]*fj[2] + drik[0]*fk[2]);
-  v[5] = (drij[1]*fj[2] + drik[1]*fk[2]);
+  v[0] = drij[0]*fj[0] + drik[0]*fk[0];
+  v[1] = drij[1]*fj[1] + drik[1]*fk[1];
+  v[2] = drij[2]*fj[2] + drik[2]*fk[2];
+  v[3] = drij[0]*fj[1] + drik[0]*fk[1];
+  v[4] = drij[0]*fj[2] + drik[0]*fk[2];
+  v[5] = drij[1]*fj[2] + drik[1]*fk[2];
 
   if (vflag_global) {
     ev.v[0] += v[0];
@@ -3745,12 +3767,12 @@ void PairReaxCKokkos<DeviceType>::v_tally3(EV_FLOAT_REAX &ev, const int &i, cons
   }
 
   if (vflag_atom) {
-    a_vatom(i,0) += THIRD*v[0]; a_vatom(i,1) += THIRD*v[1]; a_vatom(i,2) += THIRD*v[2];
-    a_vatom(i,3) += THIRD*v[3]; a_vatom(i,4) += THIRD*v[4]; a_vatom(i,5) += THIRD*v[5];
-    a_vatom(j,0) += THIRD*v[0]; a_vatom(j,1) += THIRD*v[1]; a_vatom(j,2) += THIRD*v[2];
-    a_vatom(j,3) += THIRD*v[3]; a_vatom(j,4) += THIRD*v[4]; a_vatom(j,5) += THIRD*v[5];
-    a_vatom(k,0) += THIRD*v[0]; a_vatom(k,1) += THIRD*v[1]; a_vatom(k,2) += THIRD*v[2];
-    a_vatom(k,3) += THIRD*v[3]; a_vatom(k,4) += THIRD*v[4]; a_vatom(k,5) += THIRD*v[5];
+    a_vatom(i,0) += THIRD * v[0]; a_vatom(i,1) += THIRD * v[1]; a_vatom(i,2) += THIRD * v[2];
+    a_vatom(i,3) += THIRD * v[3]; a_vatom(i,4) += THIRD * v[4]; a_vatom(i,5) += THIRD * v[5];
+    a_vatom(j,0) += THIRD * v[0]; a_vatom(j,1) += THIRD * v[1]; a_vatom(j,2) += THIRD * v[2];
+    a_vatom(j,3) += THIRD * v[3]; a_vatom(j,4) += THIRD * v[4]; a_vatom(j,5) += THIRD * v[5];
+    a_vatom(k,0) += THIRD * v[0]; a_vatom(k,1) += THIRD * v[1]; a_vatom(k,2) += THIRD * v[2];
+    a_vatom(k,3) += THIRD * v[3]; a_vatom(k,4) += THIRD * v[4]; a_vatom(k,5) += THIRD * v[5];
   }
 
 }
@@ -3767,12 +3789,12 @@ void PairReaxCKokkos<DeviceType>::v_tally4(EV_FLOAT_REAX &ev, const int &i, cons
   // The vatom array is atomic for Half/Thread neighbor style
   F_FLOAT v[6];
 
-  v[0] = 0.25 * (dril[0]*fi[0] + drjl[0]*fj[0] + drkl[0]*fk[0]);
-  v[1] = 0.25 * (dril[1]*fi[1] + drjl[1]*fj[1] + drkl[1]*fk[1]);
-  v[2] = 0.25 * (dril[2]*fi[2] + drjl[2]*fj[2] + drkl[2]*fk[2]);
-  v[3] = 0.25 * (dril[0]*fi[1] + drjl[0]*fj[1] + drkl[0]*fk[1]);
-  v[4] = 0.25 * (dril[0]*fi[2] + drjl[0]*fj[2] + drkl[0]*fk[2]);
-  v[5] = 0.25 * (dril[1]*fi[2] + drjl[1]*fj[2] + drkl[1]*fk[2]);
+  v[0] = dril[0]*fi[0] + drjl[0]*fj[0] + drkl[0]*fk[0];
+  v[1] = dril[1]*fi[1] + drjl[1]*fj[1] + drkl[1]*fk[1];
+  v[2] = dril[2]*fi[2] + drjl[2]*fj[2] + drkl[2]*fk[2];
+  v[3] = dril[0]*fi[1] + drjl[0]*fj[1] + drkl[0]*fk[1];
+  v[4] = dril[0]*fi[2] + drjl[0]*fj[2] + drkl[0]*fk[2];
+  v[5] = dril[1]*fi[2] + drjl[1]*fj[2] + drkl[1]*fk[2];
 
   if (vflag_global) {
     ev.v[0] += v[0];
@@ -3785,14 +3807,14 @@ void PairReaxCKokkos<DeviceType>::v_tally4(EV_FLOAT_REAX &ev, const int &i, cons
 
   if (vflag_atom) {
     Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_vatom = v_vatom;
-    a_vatom(i,0) += v[0]; a_vatom(i,1) += v[1]; a_vatom(i,2) += v[2];
-    a_vatom(i,3) += v[3]; a_vatom(i,4) += v[4]; a_vatom(i,5) += v[5];
-    a_vatom(j,0) += v[0]; a_vatom(j,1) += v[1]; a_vatom(j,2) += v[2];
-    a_vatom(j,3) += v[3]; a_vatom(j,4) += v[4]; a_vatom(j,5) += v[5];
-    a_vatom(k,0) += v[0]; a_vatom(k,1) += v[1]; a_vatom(k,2) += v[2];
-    a_vatom(k,3) += v[3]; a_vatom(k,4) += v[4]; a_vatom(k,5) += v[5];
-    a_vatom(l,0) += v[0]; a_vatom(l,1) += v[1]; a_vatom(l,2) += v[2];
-    a_vatom(l,3) += v[3]; a_vatom(l,4) += v[4]; a_vatom(l,5) += v[5];
+    a_vatom(i,0) += 0.25 * v[0]; a_vatom(i,1) += 0.25 * v[1]; a_vatom(i,2) += 0.25 * v[2];
+    a_vatom(i,3) += 0.25 * v[3]; a_vatom(i,4) += 0.25 * v[4]; a_vatom(i,5) += 0.25 * v[5];
+    a_vatom(j,0) += 0.25 * v[0]; a_vatom(j,1) += 0.25 * v[1]; a_vatom(j,2) += 0.25 * v[2];
+    a_vatom(j,3) += 0.25 * v[3]; a_vatom(j,4) += 0.25 * v[4]; a_vatom(j,5) += 0.25 * v[5];
+    a_vatom(k,0) += 0.25 * v[0]; a_vatom(k,1) += 0.25 * v[1]; a_vatom(k,2) += 0.25 * v[2];
+    a_vatom(k,3) += 0.25 * v[3]; a_vatom(k,4) += 0.25 * v[4]; a_vatom(k,5) += 0.25 * v[5];
+    a_vatom(l,0) += 0.25 * v[0]; a_vatom(l,1) += 0.25 * v[1]; a_vatom(l,2) += 0.25 * v[2];
+    a_vatom(l,3) += 0.25 * v[3]; a_vatom(l,4) += 0.25 * v[4]; a_vatom(l,5) += 0.25 * v[5];
   }
 
 }
@@ -3894,6 +3916,14 @@ void PairReaxCKokkos<DeviceType>::ev_setup(int eflag, int vflag)
 
   if (eflag_global) eng_vdwl = eng_coul = 0.0;
   if (vflag_global) for (i = 0; i < 6; i++) virial[i] = 0.0;
+  if (eflag_atom) {
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxZeroEAtom>(0,maxeatom),*this);
+    DeviceType::fence();
+  }
+  if (vflag_atom) {
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, PairReaxZeroVAtom>(0,maxvatom),*this);
+    DeviceType::fence();
+  }
 
   // if vflag_global = 2 and pair::compute() calls virial_fdotr_compute()
   // compute global virial via (F dot r) instead of via pairwise summation
diff --git a/src/KOKKOS/pair_reax_c_kokkos.h b/src/KOKKOS/pair_reax_c_kokkos.h
index 8bba6c50d0..28166df3bd 100644
--- a/src/KOKKOS/pair_reax_c_kokkos.h
+++ b/src/KOKKOS/pair_reax_c_kokkos.h
@@ -84,6 +84,10 @@ struct PairReaxBuildListsHalf_LessAtomics{};
 
 struct PairReaxZero{};
 
+struct PairReaxZeroEAtom{};
+
+struct PairReaxZeroVAtom{};
+
 struct PairReaxBondOrder1{};
 
 struct PairReaxBondOrder1_LessAtomics{};
@@ -172,6 +176,12 @@ class PairReaxCKokkos : public PairReaxC {
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxZero, const int&) const;
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()(PairReaxZeroEAtom, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(PairReaxZeroVAtom, const int&) const;
+
   KOKKOS_INLINE_FUNCTION
   void operator()(PairReaxBondOrder1, const int&) const;
 
diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp
new file mode 100644
index 0000000000..de9c0ae630
--- /dev/null
+++ b/src/KOKKOS/pppm_kokkos.cpp
@@ -0,0 +1,3193 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Stan Moore (SNL)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "pppm_kokkos.h"
+#include "atom_kokkos.h"
+#include "comm.h"
+#include "gridcomm_kokkos.h"
+#include "neighbor.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "domain.h"
+#include "fft3d_wrap.h"
+#include "remap_wrap.h"
+#include "memory.h"
+#include "error.h"
+#include "atom_masks.h"
+
+#include "math_const.h"
+#include "math_special_kokkos.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecialKokkos;
+
+#define MAXORDER 7
+#define OFFSET 16384
+#define LARGE 10000.0
+#define SMALL 0.00001
+#define EPS_HOC 1.0e-7
+
+enum{REVERSE_RHO};
+enum{FORWARD_IK,FORWARD_IK_PERATOM};
+
+#ifdef FFT_SINGLE
+#define ZEROF 0.0f
+#define ONEF  1.0f
+#else
+#define ZEROF 0.0
+#define ONEF  1.0
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PPPMKokkos<DeviceType>::PPPMKokkos(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
+{
+  if (narg < 1) error->all(FLERR,"Illegal kspace_style pppm command");
+
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = X_MASK | F_MASK | TYPE_MASK | Q_MASK;
+  datamask_modify = F_MASK;
+
+  pppmflag = 1;
+  group_group_enable = 0;
+  triclinic_support = 0;
+
+  accuracy_relative = fabs(force->numeric(FLERR,arg[0]));
+
+  nfactors = 3;
+  //factors = new int[nfactors];
+  factors[0] = 2;
+  factors[1] = 3;
+  factors[2] = 5;
+
+  MPI_Comm_rank(world,&me);
+  MPI_Comm_size(world,&nprocs);
+
+  //density_brick = d_vdx_brick = d_vdy_brick = d_vdz_brick = NULL;
+  //d_density_fft = NULL;
+  //d_u_brick = NULL;
+  //d_v0_brick = d_v1_brick = d_v2_brick = d_v3_brick = d_v4_brick = d_v5_brick = NULL;
+  //greensfn = NULL;
+  //d_work1 = d_work2 = NULL;
+  //vg = NULL;
+  //d_fkx = d_fky = d_fkz = NULL;
+
+
+  //gf_b = NULL;
+  //rho1d = rho_coeff = drho1d = drho_coeff = NULL;
+
+  fft1 = fft2 = NULL;
+  remap = NULL;
+  cg = NULL;
+  cg_peratom = NULL;
+
+  nmax = 0;
+  //part2grid = NULL;
+
+  peratom_allocate_flag = 0;
+
+  // define acons coefficients for estimation of kspace errors
+  // see JCP 109, pg 7698 for derivation of coefficients
+  // higher order coefficients may be computed if needed
+
+  acons = typename Kokkos::DualView<F_FLOAT[8][7],Kokkos::LayoutRight,LMPDeviceType>::t_host("pppm:acons");
+  acons(1,0) = 2.0 / 3.0;
+  acons(2,0) = 1.0 / 50.0;
+  acons(2,1) = 5.0 / 294.0;
+  acons(3,0) = 1.0 / 588.0;
+  acons(3,1) = 7.0 / 1440.0;
+  acons(3,2) = 21.0 / 3872.0;
+  acons(4,0) = 1.0 / 4320.0;
+  acons(4,1) = 3.0 / 1936.0;
+  acons(4,2) = 7601.0 / 2271360.0;
+  acons(4,3) = 143.0 / 28800.0;
+  acons(5,0) = 1.0 / 23232.0;
+  acons(5,1) = 7601.0 / 13628160.0;
+  acons(5,2) = 143.0 / 69120.0;
+  acons(5,3) = 517231.0 / 106536960.0;
+  acons(5,4) = 106640677.0 / 11737571328.0;
+  acons(6,0) = 691.0 / 68140800.0;
+  acons(6,1) = 13.0 / 57600.0;
+  acons(6,2) = 47021.0 / 35512320.0;
+  acons(6,3) = 9694607.0 / 2095994880.0;
+  acons(6,4) = 733191589.0 / 59609088000.0;
+  acons(6,5) = 326190917.0 / 11700633600.0;
+  acons(7,0) = 1.0 / 345600.0;
+  acons(7,1) = 3617.0 / 35512320.0;
+  acons(7,2) = 745739.0 / 838397952.0;
+  acons(7,3) = 56399353.0 / 12773376000.0;
+  acons(7,4) = 25091609.0 / 1560084480.0;
+  acons(7,5) = 1755948832039.0 / 36229939200000.0;
+  acons(7,6) = 4887769399.0 / 37838389248.0;
+
+  k_flag = DAT::tdual_int_scalar("PPPM:flag");
+}
+
+/* ----------------------------------------------------------------------
+   free all memory
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+PPPMKokkos<DeviceType>::~PPPMKokkos()
+{
+  if (copymode) return;
+
+  //delete [] factors;
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+  //memory->destroy(part2grid);
+  //memory->destroy(acons);
+  
+  memory->destroy_kokkos(k_eatom,eatom);
+  memory->destroy_kokkos(k_vatom,vatom);
+  eatom = NULL;
+  vatom = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   called once before run
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::init()
+{
+  if (me == 0) {
+    if (screen) fprintf(screen,"PPPM initialization ...\n");
+    if (logfile) fprintf(logfile,"PPPM initialization ...\n");
+  }
+
+  // error check
+
+  if (differentiation_flag == 1)
+    error->all(FLERR,"Cannot (yet) use PPPM Kokkos with 'kspace_modify diff ad'");
+
+  triclinic_check();
+  if (domain->triclinic && slabflag)
+    error->all(FLERR,"Cannot (yet) use PPPM with triclinic box and "
+               "slab correction");
+  if (domain->dimension == 2) error->all(FLERR,
+                                         "Cannot use PPPM with 2d simulation");
+  if (comm->style != 0)
+    error->universe_all(FLERR,"PPPM can only currently be used with "
+                        "comm_style brick");
+
+  if (!atomKK->q_flag) error->all(FLERR,"Kspace style requires atomKK attribute q");
+
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all(FLERR,"Cannot use nonperiodic boundaries with PPPM");
+  if (slabflag) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all(FLERR,"Incorrect boundaries with slab PPPM");
+  }
+
+  if (order < 2 || order > MAXORDER) {
+    char str[128];
+    sprintf(str,"PPPM order cannot be < 2 or > than %d",MAXORDER);
+    error->all(FLERR,str);
+  }
+
+  // extract short-range Coulombic cutoff from pair style
+
+  triclinic = domain->triclinic;
+  pair_check();
+
+  int itmp = 0;
+  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
+  if (p_cutoff == NULL)
+    error->all(FLERR,"KSpace style is incompatible with Pair style");
+  cutoff = *p_cutoff;
+
+  // if kspace is TIP4P, extract TIP4P params from pair style
+  // bond/angle are not yet init(), so insure equilibrium request is valid
+
+  qdist = 0.0;
+
+  if (tip4pflag)
+      error->all(FLERR,"Cannot (yet) use PPPM Kokkos TIP4P");
+
+  // compute qsum & qsqsum and warn if not charge-neutral
+
+  scale = 1.0;
+  qqrd2e = force->qqrd2e;
+  qsum_qsq();
+  natoms_original = atomKK->natoms;
+
+  // set accuracy (force units) from accuracy_relative or accuracy_absolute
+
+  if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
+  else accuracy = accuracy_relative * two_charge_force;
+
+  // free all arrays previously allocated
+
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+
+  // setup FFT grid resolution and g_ewald
+  // normally one iteration thru while loop is all that is required
+  // if grid stencil does not extend beyond neighbor proc
+  //   or overlap is allowed, then done
+  // else reduce order and try again
+
+  int (*procneigh)[2] = comm->procneigh;
+
+  GridCommKokkos<DeviceType> *cgtmp = NULL;
+  int iteration = 0;
+
+  while (order >= minorder) {
+    if (iteration && me == 0)
+      error->warning(FLERR,"Reducing PPPM order b/c stencil extends "
+                     "beyond nearest neighbor processor");
+
+    set_grid_global();
+    set_grid_local();
+    if (overlap_allowed) break;
+
+    cgtmp = new GridCommKokkos<DeviceType>(lmp,world,1,1,
+                         nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                         nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                         procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                         procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+    cgtmp->ghost_notify();
+    if (!cgtmp->ghost_overlap()) break;
+    delete cgtmp;
+
+    order--;
+    iteration++;
+  }
+
+  if (order < minorder) error->all(FLERR,"PPPM order < minimum allowed order");
+  if (!overlap_allowed && cgtmp->ghost_overlap())
+    error->all(FLERR,"PPPM grid stencil extends "
+               "beyond nearest neighbor processor");
+  if (cgtmp) delete cgtmp;
+
+  // adjust g_ewald
+
+  if (!gewaldflag) adjust_gewald();
+
+  // calculate the final accuracy
+
+  double estimated_accuracy = final_accuracy();
+
+  // print stats
+
+  int ngrid_max,nfft_both_max;
+  MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world);
+  MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world);
+
+  if (me == 0) {
+
+#ifdef FFT_SINGLE
+    const char fft_prec[] = "single";
+#else
+    const char fft_prec[] = "double";
+#endif
+
+    if (screen) {
+      fprintf(screen,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(screen,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      fprintf(screen,"  stencil order = %d\n",order);
+      fprintf(screen,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(screen,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(screen,"  using %s precision FFTs\n",fft_prec);
+      fprintf(screen,"  3d grid and FFT values/proc = %d %d\n",
+              ngrid_max,nfft_both_max);
+    }
+    if (logfile) {
+      fprintf(logfile,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(logfile,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      fprintf(logfile,"  stencil order = %d\n",order);
+      fprintf(logfile,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(logfile,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(logfile,"  using %s precision FFTs\n",fft_prec);
+      fprintf(logfile,"  3d grid and FFT values/proc = %d %d\n",
+              ngrid_max,nfft_both_max);
+    }
+  }
+
+  // allocate K-space dependent memory
+  // don't invoke allocate peratom(), will be allocated when needed
+
+  allocate();
+  cg->ghost_notify();
+  cg->setup();
+
+  // pre-compute Green's function denomiator expansion
+  // pre-compute 1d charge distribution coefficients
+
+  compute_gf_denom();
+  compute_rho_coeff();
+
+  k_rho_coeff.template modify<LMPHostType>();
+  k_rho_coeff.template sync<DeviceType>();
+
+}
+
+/* ----------------------------------------------------------------------
+   adjust PPPM coeffs, called initially and whenever volume has changed
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::setup()
+{
+  if (triclinic) {
+    setup_triclinic();
+    return;
+  }
+
+  // perform some checks to avoid illegal boundaries with read_data
+
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all(FLERR,"Cannot use nonperiodic boundaries with PPPM");
+  if (slabflag) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all(FLERR,"Incorrect boundaries with slab PPPM");
+  }
+
+  int i,j,k,n;
+  double *prd;
+
+  // volume-dependent factors
+  // adjust z dimension for 2d slab PPPM
+  // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0
+
+  if (triclinic == 0) prd = domain->prd;
+  else prd = domain->prd_lamda;
+
+  double xprd = prd[0];
+  double yprd = prd[1];
+  double zprd = prd[2];
+  double zprd_slab = zprd*slab_volfactor;
+  volume = xprd * yprd * zprd_slab;
+
+  delxinv = nx_pppm/xprd;
+  delyinv = ny_pppm/yprd;
+  delzinv = nz_pppm/zprd_slab;
+
+  delvolinv = delxinv*delyinv*delzinv;
+
+  unitkx = (MY_2PI/xprd);
+  unitky = (MY_2PI/yprd);
+  unitkz = (MY_2PI/zprd_slab);
+
+  // d_fkx,d_fky,d_fkz for my FFT grid pts
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup1>(nxlo_fft,nxhi_fft+1),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup2>(nylo_fft,nyhi_fft+1),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup3>(nzlo_fft,nzhi_fft+1),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  // virial coefficients
+
+  double sqk,vterm;
+
+  // merge three outer loops into one for better threading
+
+  numz_fft = nzhi_fft-nzlo_fft + 1;
+  numy_fft = nyhi_fft-nylo_fft + 1;
+  numx_fft = nxhi_fft-nxlo_fft + 1;
+  const int inum_fft = numz_fft*numy_fft*numx_fft;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup4>(0,inum_fft),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  compute_gf_ik();
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup1, const int &i) const
+{
+  double per = i - nx_pppm*(2*i/nx_pppm);
+  d_fkx[i-nxlo_fft] = unitkx*per;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup2, const int &i) const
+{
+  double per = i - ny_pppm*(2*i/ny_pppm);
+  d_fky[i-nylo_fft] = unitky*per;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup3, const int &i) const
+{
+  double per = i - nz_pppm*(2*i/nz_pppm);
+  d_fkz[i-nzlo_fft] = unitkz*per;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup4, const int &n) const
+{
+  const int k = n/(numy_fft*numx_fft);
+  const int j = (n - k*numy_fft*numx_fft) / numx_fft;
+  const int i = n - k*numy_fft*numx_fft - j*numx_fft;
+  const double sqk = d_fkx[i]*d_fkx[i] + d_fky[j]*d_fky[j] + d_fkz[k]*d_fkz[k];
+  if (sqk == 0.0) {
+    d_vg(n,0) = 0.0;
+    d_vg(n,1) = 0.0;
+    d_vg(n,2) = 0.0;
+    d_vg(n,3) = 0.0;
+    d_vg(n,4) = 0.0;
+    d_vg(n,5) = 0.0;
+  } else {
+    const double vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
+    d_vg(n,0) = 1.0 + vterm*d_fkx[i]*d_fkx[i];
+    d_vg(n,1) = 1.0 + vterm*d_fky[j]*d_fky[j];
+    d_vg(n,2) = 1.0 + vterm*d_fkz[k]*d_fkz[k];
+    d_vg(n,3) = vterm*d_fkx[i]*d_fky[j];
+    d_vg(n,4) = vterm*d_fkx[i]*d_fkz[k];
+    d_vg(n,5) = vterm*d_fky[j]*d_fkz[k];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   adjust PPPM coeffs, called initially and whenever volume has changed
+   for a triclinic system
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::setup_triclinic()
+{
+//  int i,j,k,n;
+//  double *prd;
+//
+//  // volume-dependent factors
+//  // adjust z dimension for 2d slab PPPM
+//  // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0
+//
+//  prd = domain->prd;
+//
+//  double xprd = prd[0];
+//  double yprd = prd[1];
+//  double zprd = prd[2];
+//  double zprd_slab = zprd*slab_volfactor;
+//  volume = xprd * yprd * zprd_slab;
+//
+//  // use lamda (0-1) coordinates
+//
+//  delxinv = nx_pppm;
+//  delyinv = ny_pppm;
+//  delzinv = nz_pppm;
+//  delvolinv = delxinv*delyinv*delzinv/volume;
+//
+//  // d_fkx,d_fky,d_fkz for my FFT grid pts
+//
+//  double per_i,per_j,per_k;
+//
+//  n = 0;
+//  for (k = nzlo_fft; k <= nzhi_fft; k++) { // parallel_for
+//    per_k = k - nz_pppm*(2*k/nz_pppm);
+//    for (j = nylo_fft; j <= nyhi_fft; j++) {
+//      per_j = j - ny_pppm*(2*j/ny_pppm);
+//      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+//        per_i = i - nx_pppm*(2*i/nx_pppm);
+//
+//        double unitk_lamda[3];
+//        unitk_lamda[0] = 2.0*MY_PI*per_i;
+//        unitk_lamda[1] = 2.0*MY_PI*per_j;
+//        unitk_lamda[2] = 2.0*MY_PI*per_k;
+//        x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
+//        d_fkx[n] = unitk_lamda[0];
+//        d_fky[n] = unitk_lamda[1];
+//        d_fkz[n] = unitk_lamda[2];
+//        n++;
+//      }
+//    }
+//  }
+//
+//  // virial coefficients
+//
+//  double sqk,vterm;
+//
+//  for (n = 0; n < nfft; n++) { // parallel_for
+//    sqk = d_fkx[n]*d_fkx[n] + d_fky[n]*d_fky[n] + d_fkz[n]*d_fkz[n];
+//    if (sqk == 0.0) {
+//      d_vg(n,0) = 0.0;
+//      d_vg(n,1) = 0.0;
+//      d_vg(n,2) = 0.0;
+//      d_vg(n,3) = 0.0;
+//      d_vg(n,4) = 0.0;
+//      d_vg(n,5) = 0.0;
+//    } else {
+//      vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
+//      d_vg(n,0) = 1.0 + vterm*d_fkx[n]*d_fkx[n];
+//      d_vg(n,1) = 1.0 + vterm*d_fky[n]*d_fky[n];
+//      d_vg(n,2) = 1.0 + vterm*d_fkz[n]*d_fkz[n];
+//      d_vg(n,3) = vterm*d_fkx[n]*d_fky[n];
+//      d_vg(n,4) = vterm*d_fkx[n]*d_fkz[n];
+//      d_vg(n,5) = vterm*d_fky[n]*d_fkz[n];
+//    }
+//  }
+//
+//  compute_gf_ik_triclinic();
+}
+
+/* ----------------------------------------------------------------------
+   reset local grid arrays and communication stencils
+   called by fix balance b/c it changed sizes of processor sub-domains
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::setup_grid()
+{
+  // free all arrays previously allocated
+
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+
+  // reset portion of global grid that each proc owns
+
+  set_grid_local();
+
+  // reallocate K-space dependent memory
+  // check if grid communication is now overlapping if not allowed
+  // don't invoke allocate peratom(), will be allocated when needed
+
+  allocate();
+
+  cg->ghost_notify();
+  if (overlap_allowed == 0 && cg->ghost_overlap())
+    error->all(FLERR,"PPPM grid stencil extends "
+               "beyond nearest neighbor processor");
+  cg->setup();
+
+  // pre-compute Green's function denomiator expansion
+  // pre-compute 1d charge distribution coefficients
+
+  compute_gf_denom();
+  compute_rho_coeff();
+
+  // pre-compute volume-dependent coeffs
+
+  setup();
+}
+
+/* ----------------------------------------------------------------------
+   compute the PPPM long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::compute(int eflag, int vflag)
+{
+  int i,j;
+
+  // set energy/virial flags
+  // invoke allocate_peratom() if needed for first time
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    d_vatom = k_vatom.view<DeviceType>();
+  }
+
+  if (evflag_atom && !peratom_allocate_flag) {
+    allocate_peratom();
+    cg_peratom->ghost_notify();
+    cg_peratom->setup();
+  }
+
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  q = atomKK->k_q.view<DeviceType>();
+ 
+  atomKK->sync(execution_space,datamask_read);
+  atomKK->modified(execution_space,datamask_modify);
+ 
+  //nlocal = atomKK->nlocal;
+  //nall = atomKK->nlocal + atomKK->nghost;
+  //newton_pair = force->newton_pair;
+
+  // if atomKK count has changed, update qsum and qsqsum
+
+  if (atomKK->natoms != natoms_original) {
+    qsum_qsq();
+    natoms_original = atomKK->natoms;
+  }
+
+  // return if there are no charges
+
+  if (qsqsum == 0.0) return;
+
+  // convert atoms from box to lamda coords
+
+  if (triclinic == 0) {
+    boxlo[0] = domain->boxlo[0];
+    boxlo[1] = domain->boxlo[1];
+    boxlo[2] = domain->boxlo[2];
+  } else {
+    boxlo[0] = domain->boxlo_lamda[0];
+    boxlo[1] = domain->boxlo_lamda[1];
+    boxlo[2] = domain->boxlo_lamda[2];
+    domain->x2lamda(atomKK->nlocal);
+  }
+
+  // extend size of per-atomKK arrays if necessary
+
+  if (atomKK->nmax > nmax) {
+    //memory->destroy(part2grid);
+    nmax = atomKK->nmax;
+    //memory->create(part2grid,nmax,3,"pppm:part2grid");
+    d_part2grid = typename AT::t_int_1d_3("pppm:part2grid",nmax);
+    d_rho1d = typename AT::t_FFT_SCALAR_2d_3("pppm:rho1d",nmax,order/2+order/2+1);
+  }
+
+  // find grid points for all my particles
+  // map my particle charge onto my local 3d density grid
+
+  particle_map();
+  make_rho();
+
+  // all procs communicate density values from their ghost cells
+  //   to fully sum contribution in their 3d bricks
+  // remap from 3d decomposition to FFT decomposition
+
+  cg->reverse_comm(this,REVERSE_RHO);
+  brick2fft();
+
+  // compute potential gradient on my FFT grid and
+  //   portion of e_long on this proc's FFT grid
+  // return gradients (electric fields) in 3d brick decomposition
+  // also performs per-atomKK calculations via poisson_peratom()
+
+  poisson();
+
+  // all procs communicate E-field values
+  // to fill ghost cells surrounding their 3d bricks
+
+  cg->forward_comm(this,FORWARD_IK);
+
+  // extra per-atomKK energy/virial communication
+
+  if (evflag_atom)
+      cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
+
+  // calculate the force on my particles
+
+  fieldforce();
+
+  // extra per-atomKK energy/virial communication
+
+  if (evflag_atom) fieldforce_peratom();
+
+  // sum global energy across procs and add in volume-dependent term
+
+  qscale = qqrd2e * scale;
+
+  if (eflag_global) {
+    double energy_all;
+    MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
+    energy = energy_all;
+
+    energy *= 0.5*volume;
+    energy -= g_ewald*qsqsum/MY_PIS +
+      MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
+    energy *= qscale;
+  }
+
+  // sum global virial across procs
+
+  if (vflag_global) {
+    double virial_all[6];
+    MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
+    for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
+  }
+
+  // per-atomKK energy/virial
+  // energy includes self-energy correction
+  // notal accounts for TIP4P tallying d_eatom/vatom for ghost atoms
+
+  if (evflag_atom) {
+    int nlocal = atomKK->nlocal;
+    int ntotal = nlocal;
+    //if (tip4pflag) ntotal += atomKK->nghost;
+
+    if (eflag_atom) {
+      copymode = 1;
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_self1>(0,nlocal),*this);
+      DeviceType::fence();
+      copymode = 0;
+      //for (i = nlocal; i < ntotal; i++) d_eatom[i] *= 0.5*qscale;
+    }
+
+    if (vflag_atom) {
+      copymode = 1;
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_self2>(0,ntotal),*this);
+      DeviceType::fence();
+      copymode = 0;
+    }
+  }
+
+  // 2d slab correction
+
+  if (slabflag == 1) slabcorr();
+
+  // convert atoms back from lamda to box coords
+
+  if (triclinic) domain->lamda2x(atomKK->nlocal);
+
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_self1, const int &i) const
+{
+  d_eatom[i] *= 0.5;
+  d_eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum /
+    (g_ewald*g_ewald*volume);
+  d_eatom[i] *= qscale;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_self2, const int &i) const
+{
+  for (int j = 0; j < 6; j++) d_vatom(i,j) *= 0.5*qscale;
+}
+
+/* ----------------------------------------------------------------------
+   allocate memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::allocate()
+{
+  d_density_brick = typename AT::t_FFT_SCALAR_3d("pppm:density_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
+
+  memory->create_kokkos(k_density_fft,density_fft,nfft_both,"pppm:d_density_fft");
+  d_density_fft = k_density_fft.view<DeviceType>();
+
+  d_greensfn = typename AT::t_float_1d("pppm:greensfn",nfft_both);
+  memory->create_kokkos(k_work1,work1,2*nfft_both,"pppm:work1");
+  memory->create_kokkos(k_work2,work2,2*nfft_both,"pppm:work2");
+  d_work1 = k_work1.view<DeviceType>();
+  d_work2 = k_work2.view<DeviceType>();
+  d_vg = typename AT::t_virial_array("pppm:vg",nfft_both);
+
+  if (triclinic == 0) {
+    d_fkx = typename AT::t_float_1d("pppm:d_fkx",nxhi_fft-nxlo_fft+1);
+    d_fky = typename AT::t_float_1d("pppm:d_fky",nyhi_fft-nylo_fft+1);
+    d_fkz = typename AT::t_float_1d("pppm:d_fkz",nzhi_fft-nzlo_fft+1);
+  } else {
+    d_fkx = typename AT::t_float_1d("pppm:d_fkx",nfft_both);
+    d_fky = typename AT::t_float_1d("pppm:d_fky",nfft_both);
+    d_fkz = typename AT::t_float_1d("pppm:d_fkz",nfft_both);
+  }
+
+  d_vdx_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_vdx_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
+  d_vdy_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_vdy_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
+  d_vdz_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_vdz_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
+
+  // summation coeffs
+
+  order_allocated = order;
+  k_gf_b = typename DAT::tdual_float_1d("pppm:gf_b",order);
+  d_gf_b = k_gf_b.view<DeviceType>();
+  d_rho1d = typename AT::t_FFT_SCALAR_2d_3("pppm:rho1d",nmax,order/2+order/2+1);
+  k_rho_coeff = DAT::tdual_FFT_SCALAR_2d("pppm:rho_coeff",order,order/2-(1-order)/2+1);
+  d_rho_coeff = k_rho_coeff.view<DeviceType>();
+  h_rho_coeff = k_rho_coeff.h_view;
+
+  // create 2 FFTs and a Remap
+  // 1st FFT keeps data in FFT decompostion
+  // 2nd FFT returns data in 3d brick decomposition
+  // remap takes data from 3d brick to FFT decomposition
+
+  int tmp;
+
+  fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
+                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                   0,0,&tmp,collective_flag);
+
+  fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
+                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                   nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                   0,0,&tmp,collective_flag);
+  remap = new Remap(lmp,world,
+                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                    1,0,0,FFT_PRECISION,collective_flag);
+
+  // create ghost grid object for rho and electric field communication
+
+  int (*procneigh)[2] = comm->procneigh;
+
+  cg = new GridCommKokkos<DeviceType>(lmp,world,3,1,
+                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                    nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                    procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                    procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+}
+
+/* ----------------------------------------------------------------------
+   deallocate memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::deallocate()
+{
+  memory->destroy_kokkos(d_density_fft,density_fft);
+  density_fft = NULL;
+  memory->destroy_kokkos(d_greensfn,greensfn);
+  greensfn = NULL;
+  memory->destroy_kokkos(d_work1,work1);
+  work1 = NULL;
+  memory->destroy_kokkos(d_work2,work2);
+  work2 = NULL;
+
+  delete fft1;
+  fft1 = NULL;
+  delete fft2;
+  fft2 = NULL;
+  delete remap;
+  remap = NULL;
+  delete cg;
+  cg = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   allocate per-atomKK memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::allocate_peratom()
+{
+  peratom_allocate_flag = 1;
+
+  d_u_brick = typename AT::t_FFT_SCALAR_3d("pppm:u_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
+
+  d_v0_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v0_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
+  d_v1_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v1_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
+  d_v2_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v2_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
+  d_v3_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v3_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
+  d_v4_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v4_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
+  d_v5_brick = typename AT::t_FFT_SCALAR_3d("pppm:d_v5_brick",nzhi_out-nzlo_out+1,nyhi_out-nylo_out+1,nxhi_out-nxlo_out+1);
+
+
+  // create ghost grid object for rho and electric field communication
+
+  int (*procneigh)[2] = comm->procneigh;
+
+  cg_peratom =
+    new GridCommKokkos<DeviceType>(lmp,world,7,1,
+                 nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                 nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                 procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                 procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+}
+
+/* ----------------------------------------------------------------------
+   deallocate per-atomKK memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::deallocate_peratom()
+{
+  peratom_allocate_flag = 0;
+
+  delete cg_peratom;
+  cg_peratom = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   set global size of PPPM grid = nx,ny,nz_pppm
+   used for charge accumulation, FFTs, and electric field interpolation
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::set_grid_global()
+{
+  // use xprd,yprd,zprd (even if triclinic, and then scale later)
+  // adjust z dimension for 2d slab PPPM
+  // 3d PPPM just uses zprd since slab_volfactor = 1.0
+
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  double zprd_slab = zprd*slab_volfactor;
+
+  // make initial g_ewald estimate
+  // based on desired accuracy and real space cutoff
+  // fluid-occupied volume used to estimate real-space error
+  // zprd used rather than zprd_slab
+
+  double h;
+  bigint natoms = atomKK->natoms;
+
+  if (!gewaldflag) {
+    if (accuracy <= 0.0)
+      error->all(FLERR,"KSpace accuracy must be > 0");
+    g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*q2);
+    if (g_ewald >= 1.0) g_ewald = (1.35 - 0.15*log(accuracy))/cutoff;
+    else g_ewald = sqrt(-log(g_ewald)) / cutoff;
+  }
+
+  // set optimal nx_pppm,ny_pppm,nz_pppm based on order and accuracy
+  // nz_pppm uses extended zprd_slab instead of zprd
+  // reduce it until accuracy target is met
+
+  if (!gridflag) {
+
+    double err;
+    h_x = h_y = h_z = 1.0/g_ewald;
+
+    nx_pppm = static_cast<int> (xprd/h_x) + 1;
+    ny_pppm = static_cast<int> (yprd/h_y) + 1;
+    nz_pppm = static_cast<int> (zprd_slab/h_z) + 1;
+
+    err = estimate_ik_error(h_x,xprd,natoms);
+    while (err > accuracy) {
+      err = estimate_ik_error(h_x,xprd,natoms);
+      nx_pppm++;
+      h_x = xprd/nx_pppm;
+    }
+
+    err = estimate_ik_error(h_y,yprd,natoms);
+    while (err > accuracy) {
+      err = estimate_ik_error(h_y,yprd,natoms);
+      ny_pppm++;
+      h_y = yprd/ny_pppm;
+    }
+
+    err = estimate_ik_error(h_z,zprd_slab,natoms);
+    while (err > accuracy) {
+      err = estimate_ik_error(h_z,zprd_slab,natoms);
+      nz_pppm++;
+      h_z = zprd_slab/nz_pppm;
+    }
+
+    // scale grid for triclinic skew
+
+    if (triclinic) {
+      double tmp[3];
+      tmp[0] = nx_pppm/xprd;
+      tmp[1] = ny_pppm/yprd;
+      tmp[2] = nz_pppm/zprd;
+      lamda2xT(&tmp[0],&tmp[0]);
+      nx_pppm = static_cast<int>(tmp[0]) + 1;
+      ny_pppm = static_cast<int>(tmp[1]) + 1;
+      nz_pppm = static_cast<int>(tmp[2]) + 1;
+    }
+  }
+
+  // boost grid size until it is factorable
+
+  while (!factorable(nx_pppm)) nx_pppm++;
+  while (!factorable(ny_pppm)) ny_pppm++;
+  while (!factorable(nz_pppm)) nz_pppm++;
+
+  if (triclinic == 0) {
+    h_x = xprd/nx_pppm;
+    h_y = yprd/ny_pppm;
+    h_z = zprd_slab/nz_pppm;
+  } else {
+    double tmp[3];
+    tmp[0] = nx_pppm;
+    tmp[1] = ny_pppm;
+    tmp[2] = nz_pppm;
+    x2lamdaT(&tmp[0],&tmp[0]);
+    h_x = 1.0/tmp[0];
+    h_y = 1.0/tmp[1];
+    h_z = 1.0/tmp[2];
+  }
+
+  if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET)
+    error->all(FLERR,"PPPM grid is too large");
+}
+
+/* ----------------------------------------------------------------------
+   check if all factors of n are in list of factors
+   return 1 if yes, 0 if no
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PPPMKokkos<DeviceType>::factorable(int n)
+{
+  int i;
+
+  while (n > 1) {
+    for (i = 0; i < nfactors; i++) {
+      if (n % factors[i] == 0) {
+        n /= factors[i];
+        break;
+      }
+    }
+    if (i == nfactors) return 0;
+  }
+
+  return 1;
+}
+
+/* ----------------------------------------------------------------------
+   compute estimated kspace force error
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PPPMKokkos<DeviceType>::compute_df_kspace()
+{
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  double zprd_slab = zprd*slab_volfactor;
+  bigint natoms = atomKK->natoms;
+  double df_kspace = 0.0;
+  double lprx = estimate_ik_error(h_x,xprd,natoms);
+  double lpry = estimate_ik_error(h_y,yprd,natoms);
+  double lprz = estimate_ik_error(h_z,zprd_slab,natoms);
+  df_kspace = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
+  return df_kspace;
+}
+
+/* ----------------------------------------------------------------------
+   estimate kspace force error for ik method
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PPPMKokkos<DeviceType>::estimate_ik_error(double h, double prd, bigint natoms)
+{
+  double sum = 0.0;
+  for (int m = 0; m < order; m++)
+    sum += acons(order,m) * pow(h*g_ewald,2.0*m);
+  double value = q2 * pow(h*g_ewald,(double)order) *
+    sqrt(g_ewald*prd*sqrt(MY_2PI)*sum/natoms) / (prd*prd);
+
+  return value;
+}
+
+/* ----------------------------------------------------------------------
+   adjust the g_ewald parameter to near its optimal value
+   using a Newton-Raphson solver
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::adjust_gewald()
+{
+  double dx;
+
+  for (int i = 0; i < LARGE; i++) {
+    dx = newton_raphson_f() / derivf();
+    g_ewald -= dx;
+    if (fabs(newton_raphson_f()) < SMALL) return;
+  }
+
+  char str[128];
+  sprintf(str, "Could not compute g_ewald");
+  error->all(FLERR, str);
+}
+
+/* ----------------------------------------------------------------------
+   calculate f(x) using Newton-Raphson solver
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PPPMKokkos<DeviceType>::newton_raphson_f()
+{
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  bigint natoms = atomKK->natoms;
+
+  double df_rspace = 2.0*q2*exp(-g_ewald*g_ewald*cutoff*cutoff) /
+       sqrt(natoms*cutoff*xprd*yprd*zprd);
+
+  double df_kspace = compute_df_kspace();
+
+  return df_rspace - df_kspace;
+}
+
+/* ----------------------------------------------------------------------
+   calculate numerical derivative f'(x) using forward difference
+   [f(x + h) - f(x)] / h
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PPPMKokkos<DeviceType>::derivf()
+{
+  double h = 0.000001;  //Derivative step-size
+  double df,f1,f2,g_ewald_old;
+
+  f1 = newton_raphson_f();
+  g_ewald_old = g_ewald;
+  g_ewald += h;
+  f2 = newton_raphson_f();
+  g_ewald = g_ewald_old;
+  df = (f2 - f1)/h;
+
+  return df;
+}
+
+/* ----------------------------------------------------------------------
+   calculate the final estimate of the accuracy
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PPPMKokkos<DeviceType>::final_accuracy()
+{
+  double xprd = domain->xprd;
+  double yprd = domain->yprd;
+  double zprd = domain->zprd;
+  bigint natoms = atomKK->natoms;
+
+  double df_kspace = compute_df_kspace();
+  double q2_over_sqrt = q2 / sqrt(natoms*cutoff*xprd*yprd*zprd);
+  double df_rspace = 2.0 * q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff);
+  double df_table = estimate_table_accuracy(q2_over_sqrt,df_rspace);
+  double estimated_accuracy = sqrt(df_kspace*df_kspace + df_rspace*df_rspace +
+                                   df_table*df_table);
+
+  return estimated_accuracy;
+}
+
+/* ----------------------------------------------------------------------
+   set local subset of PPPM/FFT grid that I own
+   n xyz lo/hi in = 3d brick that I own (inclusive)
+   n xyz lo/hi out = 3d brick + ghost cells in 6 directions (inclusive)
+   n xyz lo/hi fft = FFT columns that I own (all of x dim, 2d decomp in yz)
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::set_grid_local()
+{
+  // global indices of PPPM grid range from 0 to N-1
+  // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of
+  //   global PPPM grid that I own without ghost cells
+  // for slab PPPM, assign z grid as if it were not extended
+
+  nxlo_in = static_cast<int> (comm->xsplit[comm->myloc[0]] * nx_pppm);
+  nxhi_in = static_cast<int> (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1;
+
+  nylo_in = static_cast<int> (comm->ysplit[comm->myloc[1]] * ny_pppm);
+  nyhi_in = static_cast<int> (comm->ysplit[comm->myloc[1]+1] * ny_pppm) - 1;
+
+  nzlo_in = static_cast<int>
+      (comm->zsplit[comm->myloc[2]] * nz_pppm/slab_volfactor);
+  nzhi_in = static_cast<int>
+      (comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1;
+
+  // nlower,nupper = stencil size for mapping particles to PPPM grid
+
+  nlower = -(order-1)/2;
+  nupper = order/2;
+
+  // shift values for particle <-> grid mapping
+  // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
+
+  if (order % 2) shift = OFFSET + 0.5;
+  else shift = OFFSET;
+  if (order % 2) shiftone = 0.0;
+  else shiftone = 0.5;
+
+  // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of
+  //   global PPPM grid that my particles can contribute charge to
+  // effectively nlo_in,nhi_in + ghost cells
+  // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest
+  //           position a particle in my box can be at
+  // dist[3] = particle position bound = subbox + skin/2.0 + qdist
+  //   qdist = offset due to TIP4P fictitious charge
+  //   convert to triclinic if necessary
+  // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping
+  // for slab PPPM, assign z grid as if it were not extended
+
+  double *prd,*sublo,*subhi;
+
+  if (triclinic == 0) {
+    prd = domain->prd;
+    boxlo[0] = domain->boxlo[0];
+    boxlo[1] = domain->boxlo[1];
+    boxlo[2] = domain->boxlo[2];
+    sublo = domain->sublo;
+    subhi = domain->subhi;
+  } else {
+    prd = domain->prd_lamda;
+    boxlo[0] = domain->boxlo_lamda[0];
+    boxlo[1] = domain->boxlo_lamda[1];
+    boxlo[2] = domain->boxlo_lamda[2];
+    domain->x2lamda(atomKK->nlocal);
+    sublo = domain->sublo_lamda;
+    subhi = domain->subhi_lamda;
+  }
+
+  double xprd = prd[0];
+  double yprd = prd[1];
+  double zprd = prd[2];
+  double zprd_slab = zprd*slab_volfactor;
+
+  double dist[3];
+  double cuthalf = 0.5*neighbor->skin + qdist;
+  if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf;
+  else kspacebbox(cuthalf,&dist[0]);
+
+  int nlo,nhi;
+
+  nlo = static_cast<int> ((sublo[0]-dist[0]-boxlo[0]) *
+                            nx_pppm/xprd + shift) - OFFSET;
+  nhi = static_cast<int> ((subhi[0]+dist[0]-boxlo[0]) *
+                            nx_pppm/xprd + shift) - OFFSET;
+  nxlo_out = nlo + nlower;
+  nxhi_out = nhi + nupper;
+
+  nlo = static_cast<int> ((sublo[1]-dist[1]-boxlo[1]) *
+                            ny_pppm/yprd + shift) - OFFSET;
+  nhi = static_cast<int> ((subhi[1]+dist[1]-boxlo[1]) *
+                            ny_pppm/yprd + shift) - OFFSET;
+  nylo_out = nlo + nlower;
+  nyhi_out = nhi + nupper;
+
+  nlo = static_cast<int> ((sublo[2]-dist[2]-boxlo[2]) *
+                            nz_pppm/zprd_slab + shift) - OFFSET;
+  nhi = static_cast<int> ((subhi[2]+dist[2]-boxlo[2]) *
+                            nz_pppm/zprd_slab + shift) - OFFSET;
+  nzlo_out = nlo + nlower;
+  nzhi_out = nhi + nupper;
+
+  // for slab PPPM, change the grid boundary for processors at +z end
+  //   to include the empty volume between periodically repeating slabs
+  // for slab PPPM, want charge data communicated from -z proc to +z proc,
+  //   but not vice versa, also want field data communicated from +z proc to
+  //   -z proc, but not vice versa
+  // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells)
+  // also insure no other procs use ghost cells beyond +z limit
+
+  if (slabflag == 1) {
+    if (comm->myloc[2] == comm->procgrid[2]-1)
+      nzhi_in = nzhi_out = nz_pppm - 1;
+    nzhi_out = MIN(nzhi_out,nz_pppm-1);
+  }
+
+  // decomposition of FFT mesh
+  // global indices range from 0 to N-1
+  // proc owns entire x-dimension, clumps of columns in y,z dimensions
+  // npey_fft,npez_fft = # of procs in y,z dims
+  // if nprocs is small enough, proc can own 1 or more entire xy planes,
+  //   else proc owns 2d sub-blocks of yz plane
+  // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions
+  // nlo_fft,nhi_fft = lower/upper limit of the section
+  //   of the global FFT mesh that I own
+
+  int npey_fft,npez_fft;
+  if (nz_pppm >= nprocs) {
+    npey_fft = 1;
+    npez_fft = nprocs;
+  } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
+
+  int me_y = me % npey_fft;
+  int me_z = me / npey_fft;
+
+  nxlo_fft = 0;
+  nxhi_fft = nx_pppm - 1;
+  nylo_fft = me_y*ny_pppm/npey_fft;
+  nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
+  nzlo_fft = me_z*nz_pppm/npez_fft;
+  nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
+
+  // PPPM grid pts owned by this proc, including ghosts
+
+  ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
+    (nzhi_out-nzlo_out+1);
+
+  // FFT grids owned by this proc, without ghosts
+  // nfft = FFT points in FFT decomposition on this proc
+  // nfft_brick = FFT points in 3d brick-decomposition on this proc
+  // nfft_both = greater of 2 values
+
+  nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) *
+    (nzhi_fft-nzlo_fft+1);
+  int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) *
+    (nzhi_in-nzlo_in+1);
+  nfft_both = MAX(nfft,nfft_brick);
+}
+
+/* ----------------------------------------------------------------------
+   pre-compute Green's function denominator expansion coeffs, Gamma(2n)
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::compute_gf_denom()
+{
+  int k,l,m;
+
+  for (l = 1; l < order; l++) k_gf_b.h_view[l] = 0.0;
+  k_gf_b.h_view[0] = 1.0;
+
+  for (m = 1; m < order; m++) {
+    for (l = m; l > 0; l--)
+      k_gf_b.h_view[l] = 4.0 * (k_gf_b.h_view[l]*(l-m)*(l-m-0.5)-k_gf_b.h_view[l-1]*(l-m-1)*(l-m-1));
+    k_gf_b.h_view[0] = 4.0 * (k_gf_b.h_view[0]*(l-m)*(l-m-0.5));
+  }
+
+  bigint ifact = 1;
+  for (k = 1; k < 2*order; k++) ifact *= k;
+  double gaminv = 1.0/ifact;
+  for (l = 0; l < order; l++) k_gf_b.h_view[l] *= gaminv;
+
+  k_gf_b.template modify<LMPHostType>();
+  k_gf_b.template sync<DeviceType>();
+}
+
+/* ----------------------------------------------------------------------
+   pre-compute modified (Hockney-Eastwood) Coulomb Green's function
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::compute_gf_ik()
+{
+  const double * const prd = domain->prd;
+
+  xprd = prd[0];
+  yprd = prd[1];
+  const double zprd = prd[2];
+  zprd_slab = zprd*slab_volfactor;
+  unitkx = (MY_2PI/xprd);
+  unitky = (MY_2PI/yprd);
+  unitkz = (MY_2PI/zprd_slab);
+
+  nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
+                          pow(-log(EPS_HOC),0.25));
+  nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
+                          pow(-log(EPS_HOC),0.25));
+  nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
+                          pow(-log(EPS_HOC),0.25));
+  twoorder = 2*order;
+
+  // merge three outer loops into one for better threading
+
+  numz_fft = nzhi_fft-nzlo_fft + 1;
+  numy_fft = nyhi_fft-nylo_fft + 1;
+  numx_fft = nxhi_fft-nxlo_fft + 1;
+  const int inum_fft = numz_fft*numy_fft*numx_fft;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_compute_gf_ik>(0,inum_fft),*this);
+  DeviceType::fence();
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_compute_gf_ik, const int &n) const
+{
+  int m = n/(numy_fft*numx_fft);
+  int l = (n - m*numy_fft*numx_fft) / numx_fft;
+  int k = n - m*numy_fft*numx_fft - l*numx_fft;
+  m += nzlo_fft;
+  l += nylo_fft;
+  k += nxlo_fft;
+
+  const int mper = m - nz_pppm*(2*m/nz_pppm);
+  const double snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));
+
+  const int lper = l - ny_pppm*(2*l/ny_pppm);
+  const double sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
+
+  const int kper = k - nx_pppm*(2*k/nx_pppm);
+  const double snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
+
+  const double sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
+
+  if (sqk != 0.0) {
+    const double numerator = 12.5663706/sqk;
+    const double denominator = gf_denom(snx,sny,snz);
+    double sum1 = 0.0;
+
+    for (int nx = -nbx; nx <= nbx; nx++) {
+      const double qx = unitkx*(kper+nx_pppm*nx);
+      const double sx = exp(-0.25*square(qx/g_ewald));
+      const double argx = 0.5*qx*xprd/nx_pppm;
+      const double wx = powsinxx(argx,twoorder);
+
+      for (int ny = -nby; ny <= nby; ny++) {
+        const double qy = unitky*(lper+ny_pppm*ny);
+        const double sy = exp(-0.25*square(qy/g_ewald));
+        const double argy = 0.5*qy*yprd/ny_pppm;
+        const double wy = powsinxx(argy,twoorder);
+
+        for (int nz = -nbz; nz <= nbz; nz++) {
+          const double qz = unitkz*(mper+nz_pppm*nz);
+          const double sz = exp(-0.25*square(qz/g_ewald));
+          const double argz = 0.5*qz*zprd_slab/nz_pppm;
+          const double wz = powsinxx(argz,twoorder);
+
+          const double dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
+          const double dot2 = qx*qx+qy*qy+qz*qz;
+          sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
+        }
+      }
+    }
+    d_greensfn[n] = numerator*sum1/denominator;
+  } else d_greensfn[n] = 0.0;
+}
+
+/* ----------------------------------------------------------------------
+   pre-compute modified (Hockney-Eastwood) Coulomb Green's function
+   for a triclinic system
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::compute_gf_ik_triclinic()
+{
+  double tmp[3];
+  tmp[0] = (g_ewald/(MY_PI*nx_pppm)) * pow(-log(EPS_HOC),0.25);
+  tmp[1] = (g_ewald/(MY_PI*ny_pppm)) * pow(-log(EPS_HOC),0.25);
+  tmp[2] = (g_ewald/(MY_PI*nz_pppm)) * pow(-log(EPS_HOC),0.25);
+  lamda2xT(&tmp[0],&tmp[0]);
+  nbx = static_cast<int> (tmp[0]);
+  nby = static_cast<int> (tmp[1]);
+  nbz = static_cast<int> (tmp[2]);
+
+  twoorder = 2*order;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_compute_gf_ik_triclinic>(nzlo_fft,nzhi_fft+1),*this);
+  DeviceType::fence();
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_compute_gf_ik_triclinic, const int &m) const
+{
+  //int n = (m - nzlo_fft)*(nyhi_fft+1 - nylo_fft)*(nxhi_fft+1 - nxlo_fft);
+  //
+  //const int mper = m - nz_pppm*(2*m/nz_pppm);
+  //const double snz = square(sin(MY_PI*mper/nz_pppm));
+  //
+  //for (int l = nylo_fft; l <= nyhi_fft; l++) {
+  //  const int lper = l - ny_pppm*(2*l/ny_pppm);
+  //  const double sny = square(sin(MY_PI*lper/ny_pppm));
+  //
+  //  for (int k = nxlo_fft; k <= nxhi_fft; k++) {
+  //    const int kper = k - nx_pppm*(2*k/nx_pppm);
+  //    const double snx = square(sin(MY_PI*kper/nx_pppm));
+  //
+  //    double unitk_lamda[3];
+  //    unitk_lamda[0] = 2.0*MY_PI*kper;
+  //    unitk_lamda[1] = 2.0*MY_PI*lper;
+  //    unitk_lamda[2] = 2.0*MY_PI*mper;
+  //    x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
+  //
+  //    const double sqk = square(unitk_lamda[0]) + square(unitk_lamda[1]) + square(unitk_lamda[2]);
+  //
+  //    if (sqk != 0.0) {
+  //      const double numerator = 12.5663706/sqk;
+  //      const double denominator = gf_denom(snx,sny,snz);
+  //      double sum1 = 0.0;
+  //
+  //      for (int nx = -nbx; nx <= nbx; nx++) {
+  //        const double argx = MY_PI*kper/nx_pppm + MY_PI*nx;
+  //        const double wx = powsinxx(argx,twoorder);
+  //
+  //        for (int ny = -nby; ny <= nby; ny++) {
+  //          const double argy = MY_PI*lper/ny_pppm + MY_PI*ny;
+  //          const double wy = powsinxx(argy,twoorder);
+  //
+  //          for (int nz = -nbz; nz <= nbz; nz++) {
+  //            const double argz = MY_PI*mper/nz_pppm + MY_PI*nz;
+  //            const double wz = powsinxx(argz,twoorder);
+  //
+  //            double b[3];
+  //            b[0] = 2.0*MY_PI*nx_pppm*nx;
+  //            b[1] = 2.0*MY_PI*ny_pppm*ny;
+  //            b[2] = 2.0*MY_PI*nz_pppm*nz;
+  //            x2lamdaT(&b[0],&b[0]);
+  //
+  //            const double qx = unitk_lamda[0]+b[0];
+  //            const double sx = exp(-0.25*square(qx/g_ewald));
+  //
+  //            const double qy = unitk_lamda[1]+b[1];
+  //            const double sy = exp(-0.25*square(qy/g_ewald));
+  //
+  //            const double qz = unitk_lamda[2]+b[2];
+  //            const double sz = exp(-0.25*square(qz/g_ewald));
+  //
+  //            const double dot1 = unitk_lamda[0]*qx + unitk_lamda[1]*qy + unitk_lamda[2]*qz;
+  //            const double dot2 = qx*qx+qy*qy+qz*qz;
+  //            sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
+  //          }
+  //        }
+  //      }
+  //      d_greensfn[n++] = numerator*sum1/denominator;
+  //    } else d_greensfn[n++] = 0.0;
+  //  }
+  //}
+}
+
+/* ----------------------------------------------------------------------
+   find center grid pt for each of my particles
+   check that full stencil for the particle will fit in my 3d brick
+   store central grid pt indices in part2grid array
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::particle_map()
+{
+  int nlocal = atomKK->nlocal;
+
+  k_flag.h_view() = 0;
+  k_flag.template modify<LMPHostType>();
+  k_flag.template sync<DeviceType>();
+
+  if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2]))
+    error->one(FLERR,"Non-numeric box dimensions - simulation unstable");
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_particle_map>(0,nlocal),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  k_flag.template modify<DeviceType>();
+  k_flag.template sync<LMPHostType>();
+  if (k_flag.h_view()) error->one(FLERR,"Out of range atoms - cannot compute PPPM");
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_particle_map, const int &i) const
+{
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // current particle coord can be outside global and local box
+  // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
+
+  const int nx = static_cast<int> ((x(i,0)-boxlo[0])*delxinv+shift) - OFFSET;
+  const int ny = static_cast<int> ((x(i,1)-boxlo[1])*delyinv+shift) - OFFSET;
+  const int nz = static_cast<int> ((x(i,2)-boxlo[2])*delzinv+shift) - OFFSET;
+
+  d_part2grid(i,0) = nx;
+  d_part2grid(i,1) = ny;
+  d_part2grid(i,2) = nz;
+
+  // check that entire stencil around nx,ny,nz will fit in my 3d brick
+
+  if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
+      ny+nlower < nylo_out || ny+nupper > nyhi_out ||
+      nz+nlower < nzlo_out || nz+nupper > nzhi_out)
+    k_flag.view<DeviceType>()() = 1;
+}
+
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = charge "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::make_rho()
+{
+  int l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz,x0,y0,z0;
+
+  // clear 3d density array
+
+  //memset(&(density_brick(nzlo_out,nylo_out,nxlo_out)),0,
+  //       ngrid*sizeof(FFT_SCALAR));
+  numz_out = nzhi_out-nzlo_out + 1;
+  numy_out = nyhi_out-nylo_out + 1;
+  numx_out = nxhi_out-nxlo_out + 1;
+  const int inum_out = numz_out*numy_out*numx_out;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_make_rho_zero>(0,inum_out),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  nlocal = atomKK->nlocal;
+
+#ifdef KOKKOS_HAVE_CUDA
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_make_rho_atomic>(0,nlocal),*this);
+  DeviceType::fence();
+  copymode = 0;
+#else
+  ix = nxhi_out-nxlo_out + 1;
+  iy = nyhi_out-nylo_out + 1;
+
+  copymode = 1;
+  Kokkos::TeamPolicy<DeviceType, TagPPPM_make_rho> config(lmp->kokkos->num_threads,1);
+  Kokkos::parallel_for(config,*this);
+  DeviceType::fence();
+  copymode = 0;
+#endif
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_make_rho_zero, const int &ii) const
+{
+  int iz = ii/(numy_out*numx_out);
+  int iy = (ii - iz*numy_out*numx_out) / numx_out;
+  int ix = ii - iz*numy_out*numx_out - iy*numx_out;
+  d_density_brick(iz,iy,ix) = 0.0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_make_rho_atomic, const int &i) const
+{
+  // The density_brick array is atomic for Half/Thread neighbor style
+  Kokkos::View<FFT_SCALAR***,Kokkos::LayoutRight,DeviceType,Kokkos::MemoryTraits<Kokkos::Atomic|Kokkos::Unmanaged> > a_density_brick = d_density_brick;
+
+  int nx = d_part2grid(i,0);
+  int ny = d_part2grid(i,1);
+  int nz = d_part2grid(i,2);
+  const FFT_SCALAR dx = nx+shiftone - (x(i,0)-boxlo[0])*delxinv;
+  const FFT_SCALAR dy = ny+shiftone - (x(i,1)-boxlo[1])*delyinv;
+  const FFT_SCALAR dz = nz+shiftone - (x(i,2)-boxlo[2])*delzinv;
+
+
+  nz -= nzlo_out;
+  ny -= nylo_out;
+  nx -= nxlo_out;
+
+  compute_rho1d(i,dx,dy,dz);
+
+  const FFT_SCALAR z0 = delvolinv * q[i];
+  for (int n = nlower; n <= nupper; n++) {
+    const int mz = n+nz;
+    const FFT_SCALAR y0 = z0*d_rho1d(i,n+order/2,2);
+    for (int m = nlower; m <= nupper; m++) {
+      const int my = m+ny;
+      const FFT_SCALAR x0 = y0*d_rho1d(i,m+order/2,1);
+      for (int l = nlower; l <= nupper; l++) {
+        const int mx = l+nx;
+        a_density_brick(mz,my,mx) += x0*d_rho1d(i,l+order/2,0);
+      }
+    }
+  }
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator() (TagPPPM_make_rho, typename Kokkos::TeamPolicy<DeviceType, TagPPPM_make_rho>::member_type dev) const {
+  // adapted from USER-OMP/pppm.cpp:
+
+  // determine range of grid points handled by this thread
+  int tid = dev.league_rank();
+  // each thread works on a fixed chunk of grid points
+  const int nthreads = dev.league_size();
+  const int idelta = 1 + ngrid/nthreads;
+  int ifrom = tid*idelta;
+  int ito = ((ifrom + idelta) > ngrid) ? ngrid : ifrom + idelta;
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+
+  // loop over all local atoms for all threads
+  for (int i = 0; i < nlocal; i++) {
+
+    int nx = d_part2grid(i,0);
+    int ny = d_part2grid(i,1);
+    int nz = d_part2grid(i,2);
+
+    // pre-screen whether this atom will ever come within
+    // reach of the data segement this thread is updating.
+    if ( ((nz+nlower-nzlo_out)*ix*iy >= ito)
+         || ((nz+nupper-nzlo_out+1)*ix*iy < ifrom) ) continue;
+
+    const FFT_SCALAR dx = nx+shiftone - (x(i,0)-boxlo[0])*delxinv;
+    const FFT_SCALAR dy = ny+shiftone - (x(i,1)-boxlo[1])*delyinv;
+    const FFT_SCALAR dz = nz+shiftone - (x(i,2)-boxlo[2])*delzinv;
+
+    nz -= nzlo_out;
+    ny -= nylo_out;
+    nx -= nxlo_out;
+
+    compute_rho1d(i,dx,dy,dz);
+
+    const FFT_SCALAR z0 = delvolinv * q[i];
+    for (int n = nlower; n <= nupper; n++) {
+      const int mz = n+nz;
+      const int in = mz*ix*iy;
+      const FFT_SCALAR y0 = z0*d_rho1d(i,n+order/2,2);
+      for (int m = nlower; m <= nupper; m++) {
+        const int my = m+ny;
+        const int im = in+my*ix;
+        const FFT_SCALAR x0 = y0*d_rho1d(i,m+order/2,1);
+        for (int l = nlower; l <= nupper; l++) {
+          const int mx = l+nx;
+          const int il = im+mx;
+          // make sure each thread only updates
+          // their elements of the density grid
+          if (il >= ito) break;
+          if (il < ifrom) continue;
+          d_density_brick(mz,my,mx) += x0*d_rho1d(i,l+order/2,0);
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   remap density from 3d brick decomposition to FFT decomposition
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::brick2fft()
+{
+  // copy grabs inner portion of density from 3d brick
+  // remap could be done as pre-stage of FFT,
+  //   but this works optimally on only double values, not complex values
+
+  numz_inout = (nzhi_in-nzlo_out)-(nzlo_in-nzlo_out) + 1;
+  numy_inout = (nyhi_in-nylo_out)-(nylo_in-nylo_out) + 1;
+  numx_inout = (nxhi_in-nxlo_out)-(nxlo_in-nxlo_out) + 1;
+  const int inum_inout = numz_inout*numy_inout*numx_inout;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_brick2fft>(0,inum_inout),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  k_density_fft.template modify<DeviceType>();
+  k_density_fft.template sync<LMPHostType>();
+  remap->perform(density_fft,density_fft,work1);
+  k_density_fft.template modify<LMPHostType>();
+  k_density_fft.template sync<DeviceType>();
+
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_brick2fft, const int &ii) const
+{
+  const int n = ii;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_density_fft[n] = d_density_brick(k,j,i);
+}
+
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::poisson()
+{
+  poisson_ik();
+}
+
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver for ik
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::poisson_ik()
+{
+  int i,j,k,n;
+  double eng;
+
+  // transform charge density (r -> k)
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik1>(0,nfft),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  k_work1.template modify<DeviceType>();
+  k_work1.template sync<LMPHostType>();
+  fft1->compute(work1,work1,1);
+  k_work1.template modify<LMPHostType>();
+  k_work1.template sync<DeviceType>();
+
+
+  // global energy and virial contribution
+
+  scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm);
+  s2 = scaleinv*scaleinv;
+
+  if (eflag_global || vflag_global) {
+    EV_FLOAT ev;
+    if (vflag_global) {
+      copymode = 1;
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik2>(0,nfft),*this,ev);
+      DeviceType::fence();
+      copymode = 0;
+      for (j = 0; j < 6; j++) virial[j] += ev.v[j];
+      energy += ev.ecoul;
+    } else {
+      n = 0;
+      copymode = 1;
+      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik3>(0,nfft),*this,ev);
+      DeviceType::fence();
+      copymode = 0;
+      energy += ev.ecoul;
+    }
+  }
+
+  // scale by 1/total-grid-pts to get rho(k)
+  // multiply by Green's function to get V(k)
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik4>(0,nfft),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  // extra FFTs for per-atomKK energy/virial
+
+  if (evflag_atom) poisson_peratom();
+
+  // triclinic system
+
+  if (triclinic) {
+    poisson_ik_triclinic();
+    return;
+  }
+
+  // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k)
+  // FFT leaves data in 3d brick decomposition
+  // copy it into inner portion of vdx,vdy,vdz arrays
+
+  // merge three outer loops into one for better threading
+
+  numz_fft = nzhi_fft-nzlo_fft + 1;
+  numy_fft = nyhi_fft-nylo_fft + 1;
+  numx_fft = nxhi_fft-nxlo_fft + 1;
+  const int inum_fft = numz_fft*numy_fft*numx_fft;
+
+  numz_inout = (nzhi_in-nzlo_out)-(nzlo_in-nzlo_out) + 1;
+  numy_inout = (nyhi_in-nylo_out)-(nylo_in-nylo_out) + 1;
+  numx_inout = (nxhi_in-nxlo_out)-(nxlo_in-nxlo_out) + 1;
+  const int inum_inout = numz_inout*numy_inout*numx_inout;
+
+  // x direction gradient
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik5>(0,inum_fft),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  k_work2.template modify<DeviceType>();
+  k_work2.template sync<LMPHostType>();
+  fft2->compute(work2,work2,-1);
+  k_work2.template modify<LMPHostType>();
+  k_work2.template sync<DeviceType>();
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik6>(0,inum_inout),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+
+  // y direction gradient
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik7>(0,inum_fft),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  k_work2.template modify<DeviceType>();
+  k_work2.template sync<LMPHostType>();
+  fft2->compute(work2,work2,-1);
+  k_work2.template modify<LMPHostType>();
+  k_work2.template sync<DeviceType>();
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik8>(0,inum_inout),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  // z direction gradient
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik9>(0,inum_fft),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  k_work2.template modify<DeviceType>();
+  k_work2.template sync<LMPHostType>();
+  fft2->compute(work2,work2,-1);
+  k_work2.template modify<LMPHostType>();
+  k_work2.template sync<DeviceType>();
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik10>(0,inum_inout),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik1, const int &i) const
+{
+  d_work1[2*i] = d_density_fft[i];
+  d_work1[2*i+1] = ZEROF;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik2, const int &i, EV_FLOAT& ev) const
+{
+  const double eng = s2 * d_greensfn[i] * (d_work1[2*i]*d_work1[2*i] + d_work1[2*i+1]*d_work1[2*i+1]);
+  for (int j = 0; j < 6; j++) ev.v[j] += eng*d_vg(i,j);
+  if (eflag_global) ev.ecoul += eng;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik3, const int &i, EV_FLOAT& ev) const
+{
+  ev.ecoul +=
+    s2 * d_greensfn[i] * (d_work1[2*i]*d_work1[2*i] + d_work1[2*i+1]*d_work1[2*i+1]);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik4, const int &i) const
+{
+  d_work1[2*i] *= scaleinv * d_greensfn[i];
+  d_work1[2*i+1] *= scaleinv * d_greensfn[i];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik5, const int &ii) const
+{
+  const int n = ii*2;
+  const int k = ii/(numy_fft*numx_fft);
+  const int j = (ii - k*numy_fft*numx_fft) / numx_fft;
+  const int i = ii - k*numy_fft*numx_fft - j*numx_fft;
+  d_work2[n] = d_fkx[i]*d_work1[n+1];
+  d_work2[n+1] = -d_fkx[i]*d_work1[n];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik6, const int &ii) const
+{
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_vdx_brick(k,j,i) = d_work2[n];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik7, const int &ii) const
+{
+  const int n = ii*2;
+  const int k = ii/(numy_fft*numx_fft);
+  const int j = (ii - k*numy_fft*numx_fft) / numx_fft;
+  d_work2[n] = d_fky[j]*d_work1[n+1];
+  d_work2[n+1] = -d_fky[j]*d_work1[n];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik8, const int &ii) const
+{
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_vdy_brick(k,j,i) = d_work2[n];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik9, const int &ii) const
+{
+  const int n = ii*2;
+  const int k = ii/(numy_fft*numx_fft);
+  d_work2[n] = d_fkz[k]*d_work1[n+1];
+  d_work2[n+1] = -d_fkz[k]*d_work1[n];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik10, const int &ii) const
+{
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_vdz_brick(k,j,i) = d_work2[n];
+}
+
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver for ik for a triclinic system
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::poisson_ik_triclinic()
+{
+//  int i,j,k,n;
+//
+//  // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k)
+//  // FFT leaves data in 3d brick decomposition
+//  // copy it into inner portion of vdx,vdy,vdz arrays
+//
+//  // x direction gradient
+//
+//  n = 0;
+//  for (i = 0; i < nfft; i++) { // parallel_for1
+//    d_work2[n] = d_fkx[i]*d_work1[n+1];
+//    d_work2[n+1] = -d_fkx[i]*d_work1[n];
+//    n += 2;
+//  }
+//
+//  fft2->compute(d_work2,d_work2,-1);
+//
+//  n = 0;
+//  for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for2
+//
+//
+//  // y direction gradient
+//
+//  n = 0;
+//  for (i = 0; i < nfft; i++) { // parallel_for3
+//    d_work2[n] = d_fky[i]*d_work1[n+1];
+//    d_work2[n+1] = -d_fky[i]*d_work1[n];
+//    n += 2;
+//  }
+//
+//  fft2->compute(d_work2,d_work2,-1);
+//
+//  n = 0;
+//  for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for4
+//    for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++)
+//      for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) {
+//        d_vdy_brick(k,j,i) = d_work2[n];
+//        n += 2;
+//      }
+//
+//  // z direction gradient
+//
+//  n = 0;
+//  for (i = 0; i < nfft; i++) { // parallel_for5
+//    d_work2[n] = d_fkz[i]*d_work1[n+1];
+//    d_work2[n+1] = -d_fkz[i]*d_work1[n];
+//    n += 2;
+//  }
+//
+//  fft2->compute(d_work2,d_work2,-1);
+//
+//  n = 0;
+//  for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for6
+//
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic1, const int &k) const
+{
+
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic2, const int &k) const
+{
+//    for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++)
+//      for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) {
+//        d_vdx_brick(k,j,i) = d_work2[n];
+//        n += 2;
+//      }
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic3, const int &k) const
+{
+//  int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2;
+
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic4, const int &k) const
+{
+//  int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2;
+//
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic5, const int &k) const
+{
+//  int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2;
+//
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic6, const int &k) const
+{
+//  int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2;
+//
+//  for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++)
+//    for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) {
+//      d_vdz_brick(k,j,i) = d_work2[n];
+//      n += 2;
+//    }
+}
+
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver for per-atom energy/virial
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::poisson_peratom()
+{
+  int i,j,k,n;
+
+  // merge three outer loops into one for better threading
+
+  numz_inout = (nzhi_in-nzlo_out)-(nzlo_in-nzlo_out) + 1;
+  numy_inout = (nyhi_in-nylo_out)-(nylo_in-nylo_out) + 1;
+  numx_inout = (nxhi_in-nxlo_out)-(nxlo_in-nxlo_out) + 1;
+  const int inum_inout = numz_inout*numy_inout*numx_inout;
+
+  // energy
+
+  if (eflag_atom) {
+    copymode = 1;
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom1>(0,nfft),*this);
+    DeviceType::fence();
+    copymode = 0;
+
+    k_work2.template modify<DeviceType>();
+    k_work2.template sync<LMPHostType>();
+    fft2->compute(work2,work2,-1);
+    k_work2.template modify<LMPHostType>();
+    k_work2.template sync<DeviceType>();
+
+
+    copymode = 1;
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom2>(0,inum_inout),*this);
+    DeviceType::fence();
+    copymode = 0;
+
+  }
+
+  // 6 components of virial in v0 thru v5
+
+  if (!vflag_atom) return;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom3>(0,nfft),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  k_work2.template modify<DeviceType>();
+  k_work2.template sync<LMPHostType>();
+  fft2->compute(work2,work2,-1);
+  k_work2.template modify<LMPHostType>();
+  k_work2.template sync<DeviceType>();
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom4>(0,inum_inout),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom5>(0,nfft),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  k_work2.template modify<DeviceType>();
+  k_work2.template sync<LMPHostType>();
+  fft2->compute(work2,work2,-1);
+  k_work2.template modify<LMPHostType>();
+  k_work2.template sync<DeviceType>();
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom6>(0,inum_inout),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom7>(0,nfft),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  k_work2.template modify<DeviceType>();
+  k_work2.template sync<LMPHostType>();
+  fft2->compute(work2,work2,-1);
+  k_work2.template modify<LMPHostType>();
+  k_work2.template sync<DeviceType>();
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom8>(0,inum_inout),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom9>(0,nfft),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  k_work2.template modify<DeviceType>();
+  k_work2.template sync<LMPHostType>();
+  fft2->compute(work2,work2,-1);
+  k_work2.template modify<LMPHostType>();
+  k_work2.template sync<DeviceType>();
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom10>(0,inum_inout),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom11>(0,nfft),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  k_work2.template modify<DeviceType>();
+  k_work2.template sync<LMPHostType>();
+  fft2->compute(work2,work2,-1);
+  k_work2.template modify<LMPHostType>();
+  k_work2.template sync<DeviceType>();
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom12>(0,inum_inout),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom13>(0,nfft),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  k_work2.template modify<DeviceType>();
+  k_work2.template sync<LMPHostType>();
+  fft2->compute(work2,work2,-1);
+  k_work2.template modify<LMPHostType>();
+  k_work2.template sync<DeviceType>();
+
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_peratom14>(0,inum_inout),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom1, const int &i) const
+{
+  int n = 2*i;
+
+  d_work2[n] = d_work1[n];
+  d_work2[n+1] = d_work1[n+1];
+  n += 2;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom2, const int &ii) const
+{
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_u_brick(k,j,i) = d_work2[n];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom3, const int &i) const
+{
+  int n = 2*i;
+
+  d_work2[n] = d_work1[n]*d_vg(i,0);
+  d_work2[n+1] = d_work1[n+1]*d_vg(i,0);
+  n += 2;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom4, const int &ii) const
+{
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_v0_brick(k,j,i) = d_work2[n];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom5, const int &i) const
+{
+  int n = 2*i;
+
+  d_work2[n] = d_work1[n]*d_vg(i,1);
+  d_work2[n+1] = d_work1[n+1]*d_vg(i,1);
+  n += 2;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom6, const int &ii) const
+{
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_v1_brick(k,j,i) = d_work2[n];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom7, const int &i) const
+{
+  int n = 2*i;
+
+  d_work2[n] = d_work1[n]*d_vg(i,2);
+  d_work2[n+1] = d_work1[n+1]*d_vg(i,2);
+  n += 2;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom8, const int &ii) const
+{
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_v2_brick(k,j,i) = d_work2[n];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom9, const int &i) const
+{
+  int n = 2*i;
+
+  d_work2[n] = d_work1[n]*d_vg(i,3);
+  d_work2[n+1] = d_work1[n+1]*d_vg(i,3);
+  n += 2;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom10, const int &ii) const
+{
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_v3_brick(k,j,i) = d_work2[n];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom11, const int &i) const
+{
+  int n = 2*i;
+
+  d_work2[n] = d_work1[n]*d_vg(i,4);
+  d_work2[n+1] = d_work1[n+1]*d_vg(i,4);
+  n += 2;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom12, const int &ii) const
+{
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_v4_brick(k,j,i) = d_work2[n];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom13, const int &i) const
+{
+  int n = 2*i;
+
+  d_work2[n] = d_work1[n]*d_vg(i,5);
+  d_work2[n+1] = d_work1[n+1]*d_vg(i,5);
+  n += 2;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_peratom14, const int &ii) const
+{
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_v5_brick(k,j,i) = d_work2[n];
+}
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::fieldforce()
+{
+  fieldforce_ik();
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles for ik
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::fieldforce_ik()
+{
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
+  int nlocal = atomKK->nlocal;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_fieldforce_ik>(0,nlocal),*this);
+  DeviceType::fence();
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_fieldforce_ik, const int &i) const
+{
+  int l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz,x0,y0,z0;
+  FFT_SCALAR ekx,eky,ekz;
+
+  nx = d_part2grid(i,0);
+  ny = d_part2grid(i,1);
+  nz = d_part2grid(i,2);
+  dx = nx+shiftone - (x(i,0)-boxlo[0])*delxinv;
+  dy = ny+shiftone - (x(i,1)-boxlo[1])*delyinv;
+  dz = nz+shiftone - (x(i,2)-boxlo[2])*delzinv;
+
+  nz -= nzlo_out;
+  ny -= nylo_out;
+  nx -= nxlo_out;
+
+  //compute_rho1d(i,dx,dy,dz); // hasn't changed from make_rho
+
+  ekx = eky = ekz = ZEROF;
+  for (n = nlower; n <= nupper; n++) {
+    mz = n+nz;
+    z0 = d_rho1d(i,n+order/2,2);
+    for (m = nlower; m <= nupper; m++) {
+      my = m+ny;
+      y0 = z0*d_rho1d(i,m+order/2,1);
+      for (l = nlower; l <= nupper; l++) {
+        mx = l+nx;
+        x0 = y0*d_rho1d(i,l+order/2,0);
+        ekx -= x0*d_vdx_brick(mz,my,mx);
+        eky -= x0*d_vdy_brick(mz,my,mx);
+        ekz -= x0*d_vdz_brick(mz,my,mx);
+      }
+    }
+  }
+
+  // convert E-field to force
+
+  const double qfactor = qqrd2e * scale * q[i];
+  f(i,0) += qfactor*ekx;
+  f(i,1) += qfactor*eky;
+  if (slabflag != 2) f(i,2) += qfactor*ekz;
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::fieldforce_peratom()
+{
+  // loop over my charges, interpolate from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  int nlocal = atomKK->nlocal;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_fieldforce_peratom>(0,nlocal),*this);
+  DeviceType::fence();
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_fieldforce_peratom, const int &i) const
+{
+  int l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz,x0,y0,z0;
+  FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
+
+  nx = d_part2grid(i,0);
+  ny = d_part2grid(i,1);
+  nz = d_part2grid(i,2);
+  dx = nx+shiftone - (x(i,0)-boxlo[0])*delxinv;
+  dy = ny+shiftone - (x(i,1)-boxlo[1])*delyinv;
+  dz = nz+shiftone - (x(i,2)-boxlo[2])*delzinv;
+
+  nz -= nzlo_out;
+  ny -= nylo_out;
+  nx -= nxlo_out;
+
+  compute_rho1d(i,dx,dy,dz);
+
+  u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
+  for (n = nlower; n <= nupper; n++) {
+    mz = n+nz;
+    z0 = d_rho1d(i,n+order/2,2);
+    for (m = nlower; m <= nupper; m++) {
+      my = m+ny;
+      y0 = z0*d_rho1d(i,m+order/2,1);
+      for (l = nlower; l <= nupper; l++) {
+        mx = l+nx;
+        x0 = y0*d_rho1d(i,l+order/2,0);
+        if (eflag_atom) u += x0*d_u_brick(mz,my,mx);
+        if (vflag_atom) {
+          v0 += x0*d_v0_brick(mz,my,mx);
+          v1 += x0*d_v1_brick(mz,my,mx);
+          v2 += x0*d_v2_brick(mz,my,mx);
+          v3 += x0*d_v3_brick(mz,my,mx);
+          v4 += x0*d_v4_brick(mz,my,mx);
+          v5 += x0*d_v5_brick(mz,my,mx);
+        }
+      }
+    }
+  }
+
+  if (eflag_atom) d_eatom[i] += q[i]*u;
+  if (vflag_atom) {
+    d_vatom(i,0) += q[i]*v0;
+    d_vatom(i,1) += q[i]*v1;
+    d_vatom(i,2) += q[i]*v2;
+    d_vatom(i,3) += q[i]*v3;
+    d_vatom(i,4) += q[i]*v4;
+    d_vatom(i,5) += q[i]*v5;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack own values to buf to send to another proc
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::pack_forward_kokkos(int flag, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index)
+{
+  typename AT::t_int_2d_um d_list = k_list.view<DeviceType>();
+  d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL());
+  d_buf = k_buf.view<DeviceType>();
+
+  nx = (nxhi_out-nxlo_out+1);
+  ny = (nyhi_out-nylo_out+1);
+
+  if (flag == FORWARD_IK) {
+    copymode = 1;
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_pack_forward1>(0,nlist),*this);
+    DeviceType::fence();
+    copymode = 0;
+  } else if (flag == FORWARD_IK_PERATOM) {
+    copymode = 1;
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_pack_forward2>(0,nlist),*this);
+    DeviceType::fence();
+    copymode = 0;
+  }
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_pack_forward1, const int &i) const
+{
+  const double dlist = (double) d_list_index[i];
+  const int iz = (int) (dlist/(nx*ny));
+  const int iy = (int) ((dlist - iz*nx*ny)/nx);
+  const int ix = d_list_index[i] - iz*nx*ny - iy*nx;
+  d_buf[3*i] = d_vdx_brick(iz,iy,ix);
+  d_buf[3*i+1] = d_vdy_brick(iz,iy,ix);
+  d_buf[3*i+2] = d_vdz_brick(iz,iy,ix);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_pack_forward2, const int &i) const
+{
+  const double dlist = (double) d_list_index[i];
+  const int iz = (int) (dlist/(nx*ny));
+  const int iy = (int) ((dlist - iz*nx*ny)/nx);
+  const int ix = d_list_index[i] - iz*nx*ny - iy*nx;
+  if (eflag_atom) d_buf[7*i] = d_u_brick(iz,iy,ix);
+  if (vflag_atom) {
+    d_buf[7*i+1] = d_v0_brick(iz,iy,ix);
+    d_buf[7*i+2] = d_v1_brick(iz,iy,ix);
+    d_buf[7*i+3] = d_v2_brick(iz,iy,ix);
+    d_buf[7*i+4] = d_v3_brick(iz,iy,ix);
+    d_buf[7*i+5] = d_v4_brick(iz,iy,ix);
+    d_buf[7*i+6] = d_v5_brick(iz,iy,ix);
+  }
+}
+/* ----------------------------------------------------------------------
+   unpack another proc's own values from buf and set own ghost values
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::unpack_forward_kokkos(int flag, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index)
+{
+  typename AT::t_int_2d_um d_list = k_list.view<DeviceType>();
+  d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL());
+  d_buf = k_buf.view<DeviceType>();
+
+  nx = (nxhi_out-nxlo_out+1);
+  ny = (nyhi_out-nylo_out+1);
+
+  if (flag == FORWARD_IK) {
+    copymode = 1;
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_unpack_forward1>(0,nlist),*this);
+    DeviceType::fence();
+    copymode = 0;
+  } else if (flag == FORWARD_IK_PERATOM) {
+    copymode = 1;
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_unpack_forward2>(0,nlist),*this);
+    DeviceType::fence();
+    copymode = 0;
+  }
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_unpack_forward1, const int &i) const
+{
+  const double dlist = (double) d_list_index[i];
+  const int iz = (int) (dlist/(nx*ny));
+  const int iy = (int) ((dlist - iz*nx*ny)/nx);
+  const int ix = d_list_index[i] - iz*nx*ny - iy*nx;
+  d_vdx_brick(iz,iy,ix) = d_buf[3*i];
+  d_vdy_brick(iz,iy,ix) = d_buf[3*i+1];
+  d_vdz_brick(iz,iy,ix) = d_buf[3*i+2];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_unpack_forward2, const int &i) const
+{
+  const double dlist = (double) d_list_index[i];
+  const int iz = (int) (dlist/(nx*ny));
+  const int iy = (int) ((dlist - iz*nx*ny)/nx);
+  const int ix = d_list_index[i] - iz*nx*ny - iy*nx;
+  if (eflag_atom) d_u_brick(iz,iy,ix) = d_buf[7*i];
+  if (vflag_atom) {
+    d_v0_brick(iz,iy,ix) = d_buf[7*i+1];
+    d_v1_brick(iz,iy,ix) = d_buf[7*i+2];
+    d_v2_brick(iz,iy,ix) = d_buf[7*i+3];
+    d_v3_brick(iz,iy,ix) = d_buf[7*i+4];
+    d_v4_brick(iz,iy,ix) = d_buf[7*i+5];
+    d_v5_brick(iz,iy,ix) = d_buf[7*i+6];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack ghost values into buf to send to another proc
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::pack_reverse_kokkos(int flag, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index)
+{
+  typename AT::t_int_2d_um d_list = k_list.view<DeviceType>();
+  d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL());
+  d_buf = k_buf.view<DeviceType>();
+
+  nx = (nxhi_out-nxlo_out+1);
+  ny = (nyhi_out-nylo_out+1);
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_pack_reverse>(0,nlist),*this);
+  DeviceType::fence();
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_pack_reverse, const int &i) const
+{
+  const double dlist = (double) d_list_index[i];
+  const int iz = (int) (dlist/(nx*ny));
+  const int iy = (int) ((dlist - iz*nx*ny)/nx);
+  const int ix = d_list_index[i] - iz*nx*ny - iy*nx;
+  d_buf[i] = d_density_brick(iz,iy,ix);
+}
+
+/* ----------------------------------------------------------------------
+   unpack another proc's ghost values from buf and add to own values
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::unpack_reverse_kokkos(int flag, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &k_buf, int nlist, DAT::tdual_int_2d &k_list, int index)
+{
+  typename AT::t_int_2d_um d_list = k_list.view<DeviceType>();
+  d_list_index = Kokkos::subview(d_list,index,Kokkos::ALL());
+  d_buf = k_buf.view<DeviceType>();
+
+  int nx = (nxhi_out-nxlo_out+1);
+  int ny = (nyhi_out-nylo_out+1);
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_unpack_reverse>(0,nlist),*this);
+  DeviceType::fence();
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_unpack_reverse, const int &i) const
+{
+  const double dlist = (double) d_list_index[i];
+  const int iz = (int) (dlist/(nx*ny));
+  const int iy = (int) ((dlist - iz*nx*ny)/nx);
+  const int ix = d_list_index[i] - iz*nx*ny - iy*nx;
+  d_density_brick(iz,iy,ix) += d_buf[i];
+}
+
+/* ----------------------------------------------------------------------
+   map nprocs to NX by NY grid as PX by PY procs - return optimal px,py
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::procs2grid2d(int nprocs, int nx, int ny, int *px, int *py)
+{
+  // loop thru all possible factorizations of nprocs
+  // surf = surface area of largest proc sub-domain
+  // innermost if test minimizes surface area and surface/volume ratio
+
+  int bestsurf = 2 * (nx + ny);
+  int bestboxx = 0;
+  int bestboxy = 0;
+
+  int boxx,boxy,surf,ipx,ipy;
+
+  ipx = 1;
+  while (ipx <= nprocs) {
+    if (nprocs % ipx == 0) {
+      ipy = nprocs/ipx;
+      boxx = nx/ipx;
+      if (nx % ipx) boxx++;
+      boxy = ny/ipy;
+      if (ny % ipy) boxy++;
+      surf = boxx + boxy;
+      if (surf < bestsurf ||
+          (surf == bestsurf && boxx*boxy > bestboxx*bestboxy)) {
+        bestsurf = surf;
+        bestboxx = boxx;
+        bestboxy = boxy;
+        *px = ipx;
+        *py = ipy;
+      }
+    }
+    ipx++;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   charge assignment into rho1d
+   dx,dy,dz = distance of particle from "lower left" grid point
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::compute_rho1d(const int i, const FFT_SCALAR &dx, const FFT_SCALAR &dy,
+                         const FFT_SCALAR &dz) const
+{
+  int k,l;
+  FFT_SCALAR r1,r2,r3;
+
+  for (k = (1-order)/2; k <= order/2; k++) {
+    r1 = r2 = r3 = ZEROF;
+
+    for (l = order-1; l >= 0; l--) {
+      r1 = d_rho_coeff(l,k-(1-order)/2) + r1*dx;
+      r2 = d_rho_coeff(l,k-(1-order)/2) + r2*dy;
+      r3 = d_rho_coeff(l,k-(1-order)/2) + r3*dz;
+    }
+    d_rho1d(i,k+order/2,0) = r1;
+    d_rho1d(i,k+order/2,1) = r2;
+    d_rho1d(i,k+order/2,2) = r3;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   generate coeffients for the weight function of order n
+
+              (n-1)
+  Wn(x) =     Sum    wn(k,x) , Sum is over every other integer
+           k=-(n-1)
+  For k=-(n-1),-(n-1)+2, ....., (n-1)-2,n-1
+      k is odd integers if n is even and even integers if n is odd
+              ---
+             | n-1
+             | Sum a(l,j)*(x-k/2)**l   if abs(x-k/2) < 1/2
+  wn(k,x) = <  l=0
+             |
+             |  0                       otherwise
+              ---
+  a coeffients are packed into the array rho_coeff to eliminate zeros
+  rho_coeff(l,((k+mod(n+1,2))/2) = a(l,k)
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::compute_rho_coeff()
+{
+  int j,k,l,m;
+  FFT_SCALAR s;
+
+  //FFT_SCALAR **a;
+  //memory->create2d_offset(a,order,-order,order,"pppm:a");
+  FFT_SCALAR a[order][2*order+1];
+
+  for (k = 0; k <= 2*order; k++)
+    for (l = 0; l < order; l++)
+      a[l][k] = 0.0;
+
+  a[0][order] = 1.0;
+  for (j = 1; j < order; j++) {
+    for (k = -j; k <= j; k += 2) {
+      s = 0.0;
+      for (l = 0; l < j; l++) {
+        a[l+1][k+order] = (a[l][k+1+order]-a[l][k-1+order]) / (l+1);
+#ifdef FFT_SINGLE
+        s += powf(0.5,(float) l+1) *
+          (a[l][k-1+order] + powf(-1.0,(float) l) * a[l][k+1+order]) / (l+1);
+#else
+        s += pow(0.5,(double) l+1) *
+          (a[l][k-1+order] + pow(-1.0,(double) l) * a[l][k+1+order]) / (l+1);
+#endif
+      }
+      a[0][k+order] = s;
+    }
+  }
+
+  m = (1-order)/2;
+  for (k = -(order-1); k < order; k += 2) {
+    for (l = 0; l < order; l++)
+      h_rho_coeff(l,m-(1-order)/2) = a[l][k+order];
+    m++;
+  }
+  //memory->destroy2d_offset(a,-order);
+}
+
+/* ----------------------------------------------------------------------
+   Slab-geometry correction term to dampen inter-slab interactions between
+   periodically repeating slabs.  Yields good approximation to 2D Ewald if
+   adequate empty space is left between repeating slabs (J. Chem. Phys.
+   111, 3155).  Slabs defined here to be parallel to the xy plane. Also
+   extended to non-neutral systems (J. Chem. Phys. 131, 094107).
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PPPMKokkos<DeviceType>::slabcorr()
+{
+  // compute local contribution to global dipole moment
+
+  zprd = domain->zprd;
+  int nlocal = atomKK->nlocal;
+
+  double dipole = 0.0;
+  copymode = 1;
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPPPM_slabcorr1>(0,nlocal),*this,dipole);
+  DeviceType::fence();
+  copymode = 0;
+
+  // sum local contributions to get global dipole moment
+
+  dipole_all;
+  MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world);
+
+  // need to make non-neutral systems and/or
+  //  per-atomKK energy translationally invariant
+
+  dipole_r2 = 0.0;
+  if (eflag_atom || fabs(qsum) > SMALL) {
+    copymode = 1;
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPPPM_slabcorr2>(0,nlocal),*this,dipole_r2);
+    DeviceType::fence();
+    copymode = 0;
+
+    // sum local contributions
+
+    double tmp;
+    MPI_Allreduce(&dipole_r2,&tmp,1,MPI_DOUBLE,MPI_SUM,world);
+    dipole_r2 = tmp;
+  }
+
+  // compute corrections
+
+  const double e_slabcorr = MY_2PI*(dipole_all*dipole_all -
+    qsum*dipole_r2 - qsum*qsum*zprd*zprd/12.0)/volume;
+  qscale = qqrd2e * scale;
+
+  if (eflag_global) energy += qscale * e_slabcorr;
+
+  // per-atomKK energy
+
+  if (eflag_atom) {
+    efact = qscale * MY_2PI/volume;
+    copymode = 1;
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_slabcorr3>(0,nlocal),*this);
+    DeviceType::fence();
+    copymode = 0;
+  }
+
+  // add on force corrections
+
+  ffact = qscale * (-4.0*MY_PI/volume);
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_slabcorr4>(0,nlocal),*this);
+  DeviceType::fence();
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_slabcorr1, const int &i, double &dipole) const
+{
+  dipole += q[i]*x(i,2);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_slabcorr2, const int &i, double &dipole_r2) const
+{
+  dipole_r2 += q[i]*x(i,2)*x(i,2);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_slabcorr3, const int &i) const
+{
+  d_eatom[i] += efact * q[i]*(x(i,2)*dipole_all - 0.5*(dipole_r2 +
+    qsum*x(i,2)*x(i,2)) - qsum*zprd*zprd/12.0);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_slabcorr4, const int &i) const
+{
+  f(i,2) += ffact * q[i]*(dipole_all - qsum*x(i,2));
+}
+
+/* ----------------------------------------------------------------------
+   perform and time the 1d FFTs required for N timesteps
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PPPMKokkos<DeviceType>::timing_1d(int n, double &time1d)
+{
+  double time1,time2;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_timing_zero>(0,2*nfft_both),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  MPI_Barrier(world);
+  time1 = MPI_Wtime();
+
+  for (int i = 0; i < n; i++) {
+    fft1->timing1d(work1,nfft_both,1);
+    fft2->timing1d(work1,nfft_both,-1);
+    fft2->timing1d(work1,nfft_both,-1);
+    fft2->timing1d(work1,nfft_both,-1);
+  }
+
+  MPI_Barrier(world);
+  time2 = MPI_Wtime();
+  time1d = time2 - time1;
+
+  return 4;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_timing_zero, const int &i) const
+{
+  d_work1[i] = ZEROF;
+}
+
+/* ----------------------------------------------------------------------
+   perform and time the 3d FFTs required for N timesteps
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PPPMKokkos<DeviceType>::timing_3d(int n, double &time3d)
+{
+  double time1,time2;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_timing_zero>(0,2*nfft_both),*this);
+  DeviceType::fence();
+  copymode = 0;
+
+  MPI_Barrier(world);
+  time1 = MPI_Wtime();
+
+  for (int i = 0; i < n; i++) {
+    fft1->compute(work1,work1,1);
+    fft2->compute(work1,work1,-1);
+    fft2->compute(work1,work1,-1);
+    fft2->compute(work1,work1,-1);
+  }
+
+  MPI_Barrier(world);
+  time2 = MPI_Wtime();
+  time3d = time2 - time1;
+
+  return 4;
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of local arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PPPMKokkos<DeviceType>::memory_usage()
+{
+  double bytes = nmax*3 * sizeof(double);
+  int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) *
+    (nzhi_out-nzlo_out+1);
+  bytes += 4 * nbrick * sizeof(FFT_SCALAR);
+  if (triclinic) bytes += 3 * nfft_both * sizeof(double);
+  bytes += 6 * nfft_both * sizeof(double);
+  bytes += nfft_both * sizeof(double);
+  bytes += nfft_both*5 * sizeof(FFT_SCALAR);
+
+  if (peratom_allocate_flag)
+    bytes += 6 * nbrick * sizeof(FFT_SCALAR);
+
+  bytes += cg->memory_usage();
+
+  return bytes;
+}
+
+namespace LAMMPS_NS {
+template class PPPMKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class PPPMKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h
new file mode 100644
index 0000000000..4e6bb1d74c
--- /dev/null
+++ b/src/KOKKOS/pppm_kokkos.h
@@ -0,0 +1,552 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/kk,PPPMKokkos<LMPDeviceType>)
+KSpaceStyle(pppm/kk/device,PPPMKokkos<LMPDeviceType>)
+KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_PPPM_KOKKOS_H
+#define LMP_PPPM_KOKKOS_H
+
+#include "pppm.h"
+#include "gridcomm_kokkos.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+struct TagPPPM_setup1{};
+struct TagPPPM_setup2{};
+struct TagPPPM_setup3{};
+struct TagPPPM_setup4{};
+struct TagPPPM_compute_gf_ik{};
+struct TagPPPM_compute_gf_ik_triclinic{};
+struct TagPPPM_self1{};
+struct TagPPPM_self2{};
+struct TagPPPM_brick2fft{};
+struct TagPPPM_particle_map{};
+struct TagPPPM_make_rho_zero{};
+struct TagPPPM_make_rho_atomic{};
+struct TagPPPM_make_rho{};
+struct TagPPPM_poisson_ik1{};
+struct TagPPPM_poisson_ik2{};
+struct TagPPPM_poisson_ik3{};
+struct TagPPPM_poisson_ik4{};
+struct TagPPPM_poisson_ik5{};
+struct TagPPPM_poisson_ik6{};
+struct TagPPPM_poisson_ik7{};
+struct TagPPPM_poisson_ik8{};
+struct TagPPPM_poisson_ik9{};
+struct TagPPPM_poisson_ik10{};
+struct TagPPPM_poisson_ik_triclinic1{};
+struct TagPPPM_poisson_ik_triclinic2{};
+struct TagPPPM_poisson_ik_triclinic3{};
+struct TagPPPM_poisson_ik_triclinic4{};
+struct TagPPPM_poisson_ik_triclinic5{};
+struct TagPPPM_poisson_ik_triclinic6{};
+struct TagPPPM_poisson_peratom1{};
+struct TagPPPM_poisson_peratom2{};
+struct TagPPPM_poisson_peratom3{};
+struct TagPPPM_poisson_peratom4{};
+struct TagPPPM_poisson_peratom5{};
+struct TagPPPM_poisson_peratom6{};
+struct TagPPPM_poisson_peratom7{};
+struct TagPPPM_poisson_peratom8{};
+struct TagPPPM_poisson_peratom9{};
+struct TagPPPM_poisson_peratom10{};
+struct TagPPPM_poisson_peratom11{};
+struct TagPPPM_poisson_peratom12{};
+struct TagPPPM_poisson_peratom13{};
+struct TagPPPM_poisson_peratom14{};
+struct TagPPPM_fieldforce_ik{};
+struct TagPPPM_fieldforce_peratom{};
+struct TagPPPM_pack_forward1{};
+struct TagPPPM_pack_forward2{};
+struct TagPPPM_unpack_forward1{};
+struct TagPPPM_unpack_forward2{};
+struct TagPPPM_pack_reverse{};
+struct TagPPPM_unpack_reverse{};
+struct TagPPPM_slabcorr1{};
+struct TagPPPM_slabcorr2{};
+struct TagPPPM_slabcorr3{};
+struct TagPPPM_slabcorr4{};
+struct TagPPPM_timing_zero{};
+
+template<class DeviceType>
+class PPPMKokkos : public PPPM {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  PPPMKokkos(class LAMMPS *, int, char **);
+  virtual ~PPPMKokkos();
+  virtual void init();
+  virtual void setup();
+  void setup_grid();
+  virtual void compute(int, int);
+  virtual int timing_1d(int, double &);
+  virtual int timing_3d(int, double &);
+  virtual double memory_usage();
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_setup1, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_setup2, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_setup3, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_setup4, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_compute_gf_ik, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_compute_gf_ik_triclinic, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_self1, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_self2, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_brick2fft, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_particle_map, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_make_rho_zero, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_make_rho_atomic, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_make_rho, typename Kokkos::TeamPolicy<DeviceType, TagPPPM_make_rho>::member_type) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik1, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik2, const int&, EV_FLOAT &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik3, const int&, EV_FLOAT &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik4, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik5, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik6, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik7, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik8, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik9, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik10, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik_triclinic1, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik_triclinic2, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik_triclinic3, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik_triclinic4, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik_triclinic5, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_ik_triclinic6, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom1, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom2, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom3, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom4, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom5, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom6, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom7, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom8, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom9, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom10, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom11, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom12, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom13, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_poisson_peratom14, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_fieldforce_ik, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_fieldforce_peratom, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_pack_forward1, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_pack_forward2, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_unpack_forward1, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_unpack_forward2, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_pack_reverse, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_unpack_reverse, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_slabcorr1, const int&, double&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_slabcorr2, const int&, double&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_slabcorr3, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_slabcorr4, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_timing_zero, const int&) const;
+
+  //template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  //KOKKOS_INLINE_FUNCTION
+  //void operator()(TagPPPMKernelA<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
+
+ protected:
+  double unitkx,unitky,unitkz;
+  double scaleinv,s2;
+  double qscale,efact,ffact,dipole_all,dipole_r2,zprd;
+  double xprd,yprd,zprd_slab;
+  int nbx,nby,nbz,twoorder;
+  int numx_fft,numy_fft,numz_fft;
+  int numx_inout,numy_inout,numz_inout;
+  int numx_out,numy_out,numz_out;
+  int ix,iy,nlocal;
+
+  int nx,ny,nz;
+  typename AT::t_int_1d_um d_list_index;
+  typename AT::t_FFT_SCALAR_1d_um d_buf;
+
+  DAT::tdual_int_scalar k_flag;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_f_array f;
+  typename AT::t_float_1d_randomread q;
+
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
+  typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
+  typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
+
+  int factors[3];
+
+  typename AT::t_FFT_SCALAR_3d d_density_brick;
+  typename AT::t_FFT_SCALAR_3d d_vdx_brick,d_vdy_brick,d_vdz_brick;
+  typename AT::t_FFT_SCALAR_3d d_u_brick;
+  typename AT::t_FFT_SCALAR_3d d_v0_brick,d_v1_brick,d_v2_brick;
+  typename AT::t_FFT_SCALAR_3d d_v3_brick,d_v4_brick,d_v5_brick;
+  typename AT::t_float_1d d_greensfn;
+  typename AT::t_virial_array d_vg;
+  typename AT::t_float_1d d_fkx;
+  typename AT::t_float_1d d_fky;
+  typename AT::t_float_1d d_fkz;
+  DAT::tdual_FFT_SCALAR_1d k_density_fft;
+  DAT::tdual_FFT_SCALAR_1d k_work1;
+  DAT::tdual_FFT_SCALAR_1d k_work2;
+  typename AT::t_FFT_SCALAR_1d d_density_fft;
+  typename AT::t_FFT_SCALAR_1d d_work1;
+  typename AT::t_FFT_SCALAR_1d d_work2;
+
+  DAT::tdual_float_1d k_gf_b;
+  typename AT::t_float_1d d_gf_b;
+
+  //FFT_SCALAR **rho1d,**rho_coeff,**drho1d,**drho_coeff;
+  typename AT::t_FFT_SCALAR_2d_3 d_rho1d;
+  DAT::tdual_FFT_SCALAR_2d k_rho_coeff;
+  typename AT::t_FFT_SCALAR_2d d_rho_coeff;
+  HAT::t_FFT_SCALAR_2d h_rho_coeff;
+  //double **acons;
+  typename Kokkos::DualView<F_FLOAT[8][7],Kokkos::LayoutRight,DeviceType>::t_host acons;
+
+  class FFT3d *fft1,*fft2;
+  class Remap *remap;
+  GridCommKokkos<DeviceType> *cg;
+  GridCommKokkos<DeviceType> *cg_peratom;
+
+  //int **part2grid;             // storage for particle -> grid mapping
+  typename AT::t_int_1d_3 d_part2grid;
+
+  //double *boxlo;
+  double boxlo[3];
+
+  void set_grid_global();
+  void set_grid_local();
+  void adjust_gewald();
+  double newton_raphson_f();
+  double derivf();
+  double final_accuracy();
+
+  virtual void allocate();
+  virtual void allocate_peratom();
+  virtual void deallocate();
+  virtual void deallocate_peratom();
+  int factorable(int);
+  double compute_df_kspace();
+  double estimate_ik_error(double, double, bigint);
+  virtual void compute_gf_denom();
+  virtual void compute_gf_ik();
+
+  virtual void particle_map();
+  virtual void make_rho();
+  virtual void brick2fft();
+
+  virtual void poisson();
+  virtual void poisson_ik();
+
+  virtual void fieldforce();
+  virtual void fieldforce_ik();
+
+  virtual void poisson_peratom();
+  virtual void fieldforce_peratom();
+  void procs2grid2d(int,int,int,int *, int*);
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_rho1d(const int i, const FFT_SCALAR &, const FFT_SCALAR &,
+                     const FFT_SCALAR &) const;
+  void compute_rho_coeff();
+  void slabcorr();
+
+  // grid communication
+
+  virtual void pack_forward_kokkos(int, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &, int, DAT::tdual_int_2d &, int);
+  virtual void unpack_forward_kokkos(int, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &, int, DAT::tdual_int_2d &, int);
+  virtual void pack_reverse_kokkos(int, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &, int, DAT::tdual_int_2d &, int);
+  virtual void unpack_reverse_kokkos(int, Kokkos::DualView<FFT_SCALAR*,Kokkos::LayoutRight,LMPDeviceType> &, int, DAT::tdual_int_2d &, int);
+
+  // triclinic
+
+  int triclinic;               // domain settings, orthog or triclinic
+  void setup_triclinic();
+  void compute_gf_ik_triclinic();
+  void poisson_ik_triclinic();
+
+/* ----------------------------------------------------------------------
+   denominator for Hockney-Eastwood Green's function
+     of x,y,z = sin(kx*deltax/2), etc
+
+            inf                 n-1
+   S(n,k) = Sum  W(k+pi*j)**2 = Sum b(l)*(z*z)**l
+           j=-inf               l=0
+
+          = -(z*z)**n /(2n-1)! * (d/dx)**(2n-1) cot(x)  at z = sin(x)
+   gf_b = denominator expansion coeffs
+------------------------------------------------------------------------- */
+
+  KOKKOS_INLINE_FUNCTION
+  double gf_denom(const double &x, const double &y,
+                         const double &z) const {
+    double sx,sy,sz;
+    sz = sy = sx = 0.0;
+    for (int l = order-1; l >= 0; l--) {
+      sx = d_gf_b[l] + sx*x;
+      sy = d_gf_b[l] + sy*y;
+      sz = d_gf_b[l] + sz*z;
+    }
+    double s = sx*sy*sz;
+    return s*s;
+  };
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Cannot (yet) use PPPM with triclinic box and kspace_modify diff ad
+
+This feature is not yet supported.
+
+E: Cannot (yet) use PPPM with triclinic box and slab correction
+
+This feature is not yet supported.
+
+E: Cannot use PPPM with 2d simulation
+
+The kspace style pppm cannot be used in 2d simulations.  You can use
+2d PPPM in a 3d simulation; see the kspace_modify command.
+
+E: PPPM can only currently be used with comm_style brick
+
+This is a current restriction in LAMMPS.
+
+E: Kspace style requires atom attribute q
+
+The atom style defined does not have these attributes.
+
+E: Cannot use nonperiodic boundaries with PPPM
+
+For kspace style pppm, all 3 dimensions must have periodic boundaries
+unless you use the kspace_modify command to define a 2d slab with a
+non-periodic z dimension.
+
+E: Incorrect boundaries with slab PPPM
+
+Must have periodic x,y dimensions and non-periodic z dimension to use
+2d slab option with PPPM.
+
+E: PPPM order cannot be < 2 or > than %d
+
+This is a limitation of the PPPM implementation in LAMMPS.
+
+E: KSpace style is incompatible with Pair style
+
+Setting a kspace style requires that a pair style with matching
+long-range Coulombic or dispersion components be used.
+
+E: Pair style is incompatible with TIP4P KSpace style
+
+The pair style does not have the requires TIP4P settings.
+
+E: Bond and angle potentials must be defined for TIP4P
+
+Cannot use TIP4P pair potential unless bond and angle potentials
+are defined.
+
+E: Bad TIP4P angle type for PPPM/TIP4P
+
+Specified angle type is not valid.
+
+E: Bad TIP4P bond type for PPPM/TIP4P
+
+Specified bond type is not valid.
+
+E: Cannot (yet) use PPPM with triclinic box and TIP4P
+
+This feature is not yet supported.
+
+W: Reducing PPPM order b/c stencil extends beyond nearest neighbor processor
+
+This may lead to a larger grid than desired.  See the kspace_modify overlap
+command to prevent changing of the PPPM order.
+
+E: PPPM order < minimum allowed order
+
+The default minimum order is 2.  This can be reset by the
+kspace_modify minorder command.
+
+E: PPPM grid stencil extends beyond nearest neighbor processor
+
+This is not allowed if the kspace_modify overlap setting is no.
+
+E: KSpace accuracy must be > 0
+
+The kspace accuracy designated in the input must be greater than zero.
+
+E: Could not compute grid size
+
+The code is unable to compute a grid size consistent with the desired
+accuracy.  This error should not occur for typical problems.  Please
+send an email to the developers.
+
+E: PPPM grid is too large
+
+The global PPPM grid is larger than OFFSET in one or more dimensions.
+OFFSET is currently set to 4096.  You likely need to decrease the
+requested accuracy.
+
+E: Could not compute g_ewald
+
+The Newton-Raphson solver failed to converge to a good value for
+g_ewald.  This error should not occur for typical problems.  Please
+send an email to the developers.
+
+E: Non-numeric box dimensions - simulation unstable
+
+The box size has apparently blown up.
+
+E: Out of range atoms - cannot compute PPPM
+
+One or more atoms are attempting to map their charge to a PPPM grid
+point that is not owned by a processor.  This is likely for one of two
+reasons, both of them bad.  First, it may mean that an atom near the
+boundary of a processor's sub-domain has moved more than 1/2 the
+"neighbor skin distance"_neighbor.html without neighbor lists being
+rebuilt and atoms being migrated to new processors.  This also means
+you may be missing pairwise interactions that need to be computed.
+The solution is to change the re-neighboring criteria via the
+"neigh_modify"_neigh_modify command.  The safest settings are "delay 0
+every 1 check yes".  Second, it may mean that an atom has moved far
+outside a processor's sub-domain or even the entire simulation box.
+This indicates bad physics, e.g. due to highly overlapping atoms, too
+large a timestep, etc.
+
+*/
diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp
index 9b18ad88bb..84b77a1905 100644
--- a/src/KSPACE/pppm.cpp
+++ b/src/KSPACE/pppm.cpp
@@ -152,6 +152,8 @@ PPPM::PPPM(LAMMPS *lmp, int narg, char **arg) : KSpace(lmp, narg, arg)
 
 PPPM::~PPPM()
 {
+  if (copymode) return;
+
   delete [] factors;
   deallocate();
   if (peratom_allocate_flag) deallocate_peratom();
diff --git a/src/USER-SMD/README b/src/USER-SMD/README
index 91961c4923..924157ac97 100644
--- a/src/USER-SMD/README
+++ b/src/USER-SMD/README
@@ -1,5 +1,25 @@
 Smooth Mach Dynamics -- Smooth Particle Hydrodynamics for solids and fluids
 
+Currently, the package has the following features:
+
+Does liquids via traditional Smooth Particle Hydrodynamics (SPH)
+
+Also solves solids mechanics problems via a state of the art
+stabilized meshless method with hourglass control.
+
+Can specify hydrostatic interactions independently from material
+strength models, i.e. pressure and deviatoric stresses are separated.
+
+Many material models available (Johnson-Cook, plasticity with
+hardening, Mie-Grueneisen, Polynomial EOS). Easy to add new material
+models.
+
+Rigid boundary conditions (walls) can be loaded as surface geometries
+from *.STL files.
+
+See the file doc/PDF/SMD_LAMMPS_userguide.pdf to get started.
+
+There are example scripts for using this package in examples/USER/smd.
 The person who created this package is Georg Ganzenmuller at the
 Fraunhofer-Institute for High-Speed Dynamics, Ernst Mach Institute in
 Germany (georg.ganzenmueller at emi.fhg.de).  Contact him directly if
diff --git a/src/accelerator_kokkos.h b/src/accelerator_kokkos.h
index ecba20da65..2a787ef116 100644
--- a/src/accelerator_kokkos.h
+++ b/src/accelerator_kokkos.h
@@ -87,6 +87,7 @@ class ModifyKokkos : public Modify {
 class DAT {
  public:
   typedef double tdual_xfloat_1d;
+  typedef double tdual_FFT_SCALAR_1d;
   typedef int t_int_1d;
   typedef int tdual_int_2d;
 };
diff --git a/src/kspace.cpp b/src/kspace.cpp
index d943b02c78..d5123958a1 100644
--- a/src/kspace.cpp
+++ b/src/kspace.cpp
@@ -94,6 +94,7 @@ KSpace::KSpace(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   execution_space = Host;
   datamask_read = ALL_MASK;
   datamask_modify = ALL_MASK;
+  copymode = 0;
 
   memory->create(gcons,7,7,"kspace:gcons");
   gcons[2][0] = 15.0 / 8.0;
@@ -149,6 +150,8 @@ KSpace::KSpace(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
 
 KSpace::~KSpace()
 {
+  if (copymode) return;
+
   memory->destroy(eatom);
   memory->destroy(vatom);
   memory->destroy(gcons);
diff --git a/src/kspace.h b/src/kspace.h
index 5519a8c56f..e52fb1ec66 100644
--- a/src/kspace.h
+++ b/src/kspace.h
@@ -15,6 +15,7 @@
 #define LMP_KSPACE_H
 
 #include "pointers.h"
+#include "accelerator_kokkos.h"
 
 #ifdef FFT_SINGLE
 typedef float FFT_SCALAR;
@@ -85,6 +86,7 @@ class KSpace : protected Pointers {
   // KOKKOS host/device flag and data masks
   ExecutionSpace execution_space;
   unsigned int datamask_read,datamask_modify;
+  int copymode;
 
   int compute_flag;               // 0 if skip compute()
   int fftbench;                   // 0 if skip FFT timing
@@ -124,6 +126,11 @@ class KSpace : protected Pointers {
   virtual void pack_reverse(int, FFT_SCALAR *, int, int *) {};
   virtual void unpack_reverse(int, FFT_SCALAR *, int, int *) {};
 
+  virtual void pack_forward_kokkos(int, DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
+  virtual void unpack_forward_kokkos(int, DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
+  virtual void pack_reverse_kokkos(int, DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
+  virtual void unpack_reverse_kokkos(int, DAT::tdual_FFT_SCALAR_1d &, int, DAT::tdual_int_2d &, int) {};
+
   virtual int timing(int, double &, double &) {return 0;}
   virtual int timing_1d(int, double &) {return 0;}
   virtual int timing_3d(int, double &) {return 0;}