Run lammps/lib/gpu/nvc_get_devices to list supported devices and properties
+- Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 Go
+- to http://www.nvidia.com/object/cuda_get.html Install a driver and
+- toolkit appropriate for your system (SDK is not necessary) Follow the
+- instructions in README in lammps/lib/gpu to build the library. Run
+- lammps/lib/gpu/nvc_get_devices to list supported devices and
+- properties
GPU configuration
When using GPUs, you are restricted to one physical GPU per LAMMPS
-process. Multiple processes can share a single GPU and in many cases it
-will be more efficient to run with multiple processes per GPU. Any GPU
-accelerated style requires that fix gpu be used in the
-input script to select and initialize the GPUs. The format for the fix
-is:
+process. Multiple processes can share a single GPU and in many cases
+it will be more efficient to run with multiple processes per GPU. Any
+GPU accelerated style requires that fix gpu be used in
+the input script to select and initialize the GPUs. The format for the
+fix is:
fix name all gpu mode first last split
where name is the name for the fix. The gpu fix must be the first
-fix specified for a given run, otherwise the program will exit
-with an error. The gpu fix will not have any effect on runs
-that do not use GPU acceleration; there should be no problem
-with specifying the fix first in any input script.
+fix specified for a given run, otherwise the program will exit with an
+error. The gpu fix will not have any effect on runs that do not use
+GPU acceleration; there should be no problem with specifying the fix
+first in any input script.
-mode can be either "force" or "force/neigh". In the former,
-neighbor list calculation is performed on the CPU using the
-standard LAMMPS routines. In the latter, the neighbor list
-calculation is performed on the GPU. The GPU neighbor list
-can be used for better performance, however, it
-should not be used with a triclinic box.
+mode can be either "force" or "force/neigh". In the former, neighbor
+list calculation is performed on the CPU using the standard LAMMPS
+routines. In the latter, the neighbor list calculation is performed on
+the GPU. The GPU neighbor list can be used for better performance,
+however, it cannot be used with a triclinic box or with
+hybrid pair styles.
-There are cases when it might be more efficient to select the CPU for neighbor
-list builds. If a non-GPU enabled style requires a neighbor list, it will also
-be built using CPU routines. Redundant CPU and GPU neighbor list calculations
-will typically be less efficient. For hybrid pair
-styles, GPU calculated neighbor lists might be less efficient because
-no particles will be skipped in a given neighbor list.
+There are cases when it might be more efficient to select the CPU for
+neighbor list builds. If a non-GPU enabled style requires a neighbor
+list, it will also be built using CPU routines. Redundant CPU and GPU
+neighbor list calculations will typically be less efficient.
-first is the ID (as reported by lammps/lib/gpu/nvc_get_devices)
-of the first GPU that will be used on each node. last is the
-ID of the last GPU that will be used on each node. If you have
-only one GPU per node, first and last will typically both be
-0. Selecting a non-sequential set of GPU IDs (e.g. 0,1,3)
-is not currently supported.
+first is the ID (as reported by lammps/lib/gpu/nvc_get_devices) of
+the first GPU that will be used on each node. last is the ID of the
+last GPU that will be used on each node. If you have only one GPU per
+node, first and last will typically both be 0. Selecting a
+non-sequential set of GPU IDs (e.g. 0,1,3) is not currently supported.
-split is the fraction of particles whose forces, torques,
-energies, and/or virials will be calculated on the GPU. This
-can be used to perform CPU and GPU force calculations
-simultaneously. If split is negative, the software will
-attempt to calculate the optimal fraction automatically
-every 25 timesteps based on CPU and GPU timings. Because the GPU speedups
-are dependent on the number of particles, automatic calculation of the
-split can be less efficient, but typically results in loop times
-within 20% of an optimal fixed split.
+split is the fraction of particles whose forces, torques, energies,
+and/or virials will be calculated on the GPU. This can be used to
+perform CPU and GPU force calculations simultaneously. If split is
+negative, the software will attempt to calculate the optimal fraction
+automatically every 25 timesteps based on CPU and GPU timings. Because
+the GPU speedups are dependent on the number of particles, automatic
+calculation of the split can be less efficient, but typically results
+in loop times within 20% of an optimal fixed split.
-If you have two GPUs per node, 8 CPU cores per node, and
-would like to run on 4 nodes with dynamic balancing of
-force calculation across CPU and GPU cores, the fix
-might be
+If you have two GPUs per node, 8 CPU cores per node, and would like to
+run on 4 nodes with dynamic balancing of force calculation across CPU
+and GPU cores, the fix might be
fix 0 all gpu force/neigh 0 1 -1
-with LAMMPS run on 32 processes. In this case, all
-CPU cores and GPU devices on the nodes would be utilized.
-Each GPU device would be shared by 4 CPU cores. The
-CPU cores would perform force calculations for some
-fraction of the particles at the same time the GPUs
-performed force calculation for the other particles.
+with LAMMPS run on 32 processes. In this case, all CPU cores and GPU
+devices on the nodes would be utilized. Each GPU device would be
+shared by 4 CPU cores. The CPU cores would perform force calculations
+for some fraction of the particles at the same time the GPUs performed
+force calculation for the other particles.
-Because of the large number of cores on each GPU
-device, it might be more efficient to run on fewer
-processes per GPU when the number of particles per process
-is small (100's of particles); this can be necessary
-to keep the GPU cores busy.
+Because of the large number of cores on each GPU device, it might be
+more efficient to run on fewer processes per GPU when the number of
+particles per process is small (100's of particles); this can be
+necessary to keep the GPU cores busy.
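
As a concrete illustration of the four-node example above, the job
could be launched with 32 MPI processes, 8 per node; the launcher
flags, executable name, and input file name below are only a sketch
and will vary with your MPI installation and LAMMPS build:

mpirun -np 32 -npernode 8 lmp_machine -in in.script

with in.script containing, before any other fix,

fix 0 all gpu force/neigh 0 1 -1
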
GPU input script
-In order to use GPU acceleration in LAMMPS,
-fix_gpu
-should be used in order to initialize and configure the
-GPUs for use. Additionally, GPU enabled styles must be
-selected in the input script. Currently,
-this is limited to a few pair styles.
-Some GPU-enabled styles have additional restrictions
-listed in their documentation.
+In order to use GPU acceleration in LAMMPS, fix_gpu
+should be used in order to initialize and configure the GPUs for
+use. Additionally, GPU enabled styles must be selected in the input
+script. Currently, this is limited to a few pair
+styles and PPPM. Some GPU-enabled styles have
+additional restrictions listed in their documentation.
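
For example, a minimal set of input-script lines selecting a
GPU-enabled pair style might look like the following sketch; the
cutoff and coefficients are placeholders, and any GPU-enabled pair
style listed in the pair_style documentation could be substituted:

newton          off
fix             0 all gpu force/neigh 0 0 1.0
pair_style      morse/gpu 8.0
pair_coeff      * * 1.0 2.0 1.5

The newton pair setting must be off for the GPU-enabled pair styles,
and the gpu fix must come before any other fix.
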
GPU asynchronous pair computation
-The GPU accelerated pair styles can be used to perform
-pair style force calculation on the GPU while other
-calculations are
-performed on the CPU. One method to do this is to specify
-a split in the gpu fix as described above. In this case,
-force calculation for the pair style will also be performed
-on the CPU.
+The GPU accelerated pair styles can be used to perform pair style
+force calculation on the GPU while other calculations are performed on
+the CPU. One method to do this is to specify a split in the gpu fix
+as described above. In this case, force calculation for the pair
+style will also be performed on the CPU.
-When the CPU work in a GPU pair style has finished,
-the next force computation will begin, possibly before the
-GPU has finished. If split is 1.0 in the gpu fix, the next
-force computation will begin almost immediately. This can
-be used to run a hybrid GPU pair style at
-the same time as a hybrid CPU pair style. In this case, the
-GPU pair style should be first in the hybrid command in order to
-perform simultaneous calculations. This also
-allows bond, angle,
-dihedral, improper,
-and long-range force
-computations to be run simultaneously with the GPU pair style.
-Once all CPU force computations have completed, the gpu fix
-will block until the GPU has finished all work before continuing
-the run.
+When the CPU work in a GPU pair style has finished, the next force
+computation will begin, possibly before the GPU has finished. If
+split is 1.0 in the gpu fix, the next force computation will begin
+almost immediately. This can be used to run a
+hybrid GPU pair style at the same time as a hybrid
+CPU pair style. In this case, the GPU pair style should be first in
+the hybrid command in order to perform simultaneous calculations. This
+also allows bond, angle,
+dihedral, improper, and
+long-range force computations to be run
+simultaneously with the GPU pair style. Once all CPU force
+computations have completed, the gpu fix will block until the GPU has
+finished all work before continuing the run.
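
A sketch of such a setup follows; the GPU style is listed first in the
hybrid command, split is 1.0 so the CPU is left free for the CPU
sub-style, and the force mode is used because GPU neighbor lists
cannot be used with hybrid pair styles. The styles, cutoffs, and
coefficients are placeholders:

newton          off
fix             0 all gpu force 0 0 1.0
pair_style      hybrid lj/expand/gpu 2.5 morse 8.0
pair_coeff      1 1 lj/expand/gpu 1.0 1.0 0.5
pair_coeff      2 2 morse 1.0 2.0 1.5
pair_coeff      1 2 morse 1.0 2.0 1.5
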
GPU timing
GPU accelerated pair styles can perform computations asynchronously
-with CPU computations. The "Pair" time reported by LAMMPS
-will be the maximum of the time required to complete the CPU
-pair style computations and the time required to complete the GPU
-pair style computations. Any time spent for GPU-enabled pair styles
-for computations that run simultaneously with bond,
-angle, dihedral,
-improper, and long-range calculations
-will not be included in the "Pair" time.
+with CPU computations. The "Pair" time reported by LAMMPS will be the
+maximum of the time required to complete the CPU pair style
+computations and the time required to complete the GPU pair style
+computations. Any time spent for GPU-enabled pair styles for
+computations that run simultaneously with bond,
+angle, dihedral,
+improper, and long-range
+calculations will not be included in the "Pair" time.
-When mode for the gpu fix is force/neigh,
-the time for neighbor list calculations on the GPU will be added
-into the "Pair" time, not the "Neigh" time. A breakdown of the
-times required for various tasks on the GPU (data copy, neighbor
-calculations, force computations, etc.) are output only
-with the LAMMPS screen output at the end of each run. These timings represent
-total time spent on the GPU for each routine, regardless of asynchronous
-CPU calculations.
+When mode for the gpu fix is force/neigh, the time for neighbor list
+calculations on the GPU will be added into the "Pair" time, not the
+"Neigh" time. A breakdown of the times required for various tasks on
+the GPU (data copy, neighbor calculations, force computations, etc.)
+is output only with the LAMMPS screen output at the end of each
+run. These timings represent total time spent on the GPU for each
+routine, regardless of asynchronous CPU calculations.
GPU single vs double precision
-See the lammps/lib/gpu/README file for instructions on how to build
-the LAMMPS gpu library for single, mixed, and double precision. The latter
-requires that your GPU card supports double precision.
+See the lammps/lib/gpu/README file for instructions on how to build
+the LAMMPS gpu library for single, mixed, and double precision. The
+latter requires that your GPU card supports double precision.
diff --git a/doc/Section_start.txt b/doc/Section_start.txt
index 4b4d96693f..fbdd015ab4 100644
--- a/doc/Section_start.txt
+++ b/doc/Section_start.txt
@@ -984,143 +984,130 @@ processing units (GPUs). We plan to add more over time. Currently,
they only support NVIDIA GPU cards. To use them you need to install
certain NVIDIA CUDA software on your system:
-Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0
-Go to http://www.nvidia.com/object/cuda_get.html
-Install a driver and toolkit appropriate for your system (SDK is not necessary)
-Follow the instructions in README in lammps/lib/gpu to build the library.
-Run lammps/lib/gpu/nvc_get_devices to list supported devices and properties :ul
+Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 Go
+to http://www.nvidia.com/object/cuda_get.html Install a driver and
+toolkit appropriate for your system (SDK is not necessary) Follow the
+instructions in README in lammps/lib/gpu to build the library. Run
+lammps/lib/gpu/nvc_get_devices to list supported devices and
+properties :ul
GPU configuration :h4
When using GPUs, you are restricted to one physical GPU per LAMMPS
-process. Multiple processes can share a single GPU and in many cases it
-will be more efficient to run with multiple processes per GPU. Any GPU
-accelerated style requires that "fix gpu"_fix_gpu.html be used in the
-input script to select and initialize the GPUs. The format for the fix
-is:
+process. Multiple processes can share a single GPU and in many cases
+it will be more efficient to run with multiple processes per GPU. Any
+GPU accelerated style requires that "fix gpu"_fix_gpu.html be used in
+the input script to select and initialize the GPUs. The format for the
+fix is:
fix {name} all gpu {mode} {first} {last} {split} :pre
where {name} is the name for the fix. The gpu fix must be the first
-fix specified for a given run, otherwise the program will exit
-with an error. The gpu fix will not have any effect on runs
-that do not use GPU acceleration; there should be no problem
-with specifying the fix first in any input script.
+fix specified for a given run, otherwise the program will exit with an
+error. The gpu fix will not have any effect on runs that do not use
+GPU acceleration; there should be no problem with specifying the fix
+first in any input script.
-{mode} can be either "force" or "force/neigh". In the former,
-neighbor list calculation is performed on the CPU using the
-standard LAMMPS routines. In the latter, the neighbor list
-calculation is performed on the GPU. The GPU neighbor list
-can be used for better performance, however, it
-should not be used with a triclinic box.
+{mode} can be either "force" or "force/neigh". In the former, neighbor
+list calculation is performed on the CPU using the standard LAMMPS
+routines. In the latter, the neighbor list calculation is performed on
+the GPU. The GPU neighbor list can be used for better performance,
+however, it cannot be used with a triclinic box or with
+"hybrid"_pair_hybrid.html pair styles.
-There are cases when it might be more efficient to select the CPU for neighbor
-list builds. If a non-GPU enabled style requires a neighbor list, it will also
-be built using CPU routines. Redundant CPU and GPU neighbor list calculations
-will typically be less efficient. For "hybrid"_pair_hybrid.html pair
-styles, GPU calculated neighbor lists might be less efficient because
-no particles will be skipped in a given neighbor list.
+There are cases when it might be more efficient to select the CPU for
+neighbor list builds. If a non-GPU enabled style requires a neighbor
+list, it will also be built using CPU routines. Redundant CPU and GPU
+neighbor list calculations will typically be less efficient.
-{first} is the ID (as reported by lammps/lib/gpu/nvc_get_devices)
-of the first GPU that will be used on each node. {last} is the
-ID of the last GPU that will be used on each node. If you have
-only one GPU per node, {first} and {last} will typically both be
-0. Selecting a non-sequential set of GPU IDs (e.g. 0,1,3)
-is not currently supported.
+{first} is the ID (as reported by lammps/lib/gpu/nvc_get_devices) of
+the first GPU that will be used on each node. {last} is the ID of the
+last GPU that will be used on each node. If you have only one GPU per
+node, {first} and {last} will typically both be 0. Selecting a
+non-sequential set of GPU IDs (e.g. 0,1,3) is not currently supported.
-{split} is the fraction of particles whose forces, torques,
-energies, and/or virials will be calculated on the GPU. This
-can be used to perform CPU and GPU force calculations
-simultaneously. If {split} is negative, the software will
-attempt to calculate the optimal fraction automatically
-every 25 timesteps based on CPU and GPU timings. Because the GPU speedups
-are dependent on the number of particles, automatic calculation of the
-split can be less efficient, but typically results in loop times
-within 20% of an optimal fixed split.
+{split} is the fraction of particles whose forces, torques, energies,
+and/or virials will be calculated on the GPU. This can be used to
+perform CPU and GPU force calculations simultaneously. If {split} is
+negative, the software will attempt to calculate the optimal fraction
+automatically every 25 timesteps based on CPU and GPU timings. Because
+the GPU speedups are dependent on the number of particles, automatic
+calculation of the split can be less efficient, but typically results
+in loop times within 20% of an optimal fixed split.
-If you have two GPUs per node, 8 CPU cores per node, and
-would like to run on 4 nodes with dynamic balancing of
-force calculation across CPU and GPU cores, the fix
-might be
+If you have two GPUs per node, 8 CPU cores per node, and would like to
+run on 4 nodes with dynamic balancing of force calculation across CPU
+and GPU cores, the fix might be
fix 0 all gpu force/neigh 0 1 -1 :pre
-with LAMMPS run on 32 processes. In this case, all
-CPU cores and GPU devices on the nodes would be utilized.
-Each GPU device would be shared by 4 CPU cores. The
-CPU cores would perform force calculations for some
-fraction of the particles at the same time the GPUs
-performed force calculation for the other particles.
+with LAMMPS run on 32 processes. In this case, all CPU cores and GPU
+devices on the nodes would be utilized. Each GPU device would be
+shared by 4 CPU cores. The CPU cores would perform force calculations
+for some fraction of the particles at the same time the GPUs performed
+force calculation for the other particles.
-Because of the large number of cores on each GPU
-device, it might be more efficient to run on fewer
-processes per GPU when the number of particles per process
-is small (100's of particles); this can be necessary
-to keep the GPU cores busy.
+Because of the large number of cores on each GPU device, it might be
+more efficient to run on fewer processes per GPU when the number of
+particles per process is small (100's of particles); this can be
+necessary to keep the GPU cores busy.
GPU input script :h4
-In order to use GPU acceleration in LAMMPS,
-"fix_gpu"_fix_gpu.html
-should be used in order to initialize and configure the
-GPUs for use. Additionally, GPU enabled styles must be
-selected in the input script. Currently,
-this is limited to a few "pair styles"_pair_style.html.
-Some GPU-enabled styles have additional restrictions
-listed in their documentation.
+In order to use GPU acceleration in LAMMPS, "fix_gpu"_fix_gpu.html
+should be used in order to initialize and configure the GPUs for
+use. Additionally, GPU enabled styles must be selected in the input
+script. Currently, this is limited to a few "pair
+styles"_pair_style.html and PPPM. Some GPU-enabled styles have
+additional restrictions listed in their documentation.
GPU asynchronous pair computation :h4
-The GPU accelerated pair styles can be used to perform
-pair style force calculation on the GPU while other
-calculations are
-performed on the CPU. One method to do this is to specify
-a {split} in the gpu fix as described above. In this case,
-force calculation for the pair style will also be performed
-on the CPU.
+The GPU accelerated pair styles can be used to perform pair style
+force calculation on the GPU while other calculations are performed on
+the CPU. One method to do this is to specify a {split} in the gpu fix
+as described above. In this case, force calculation for the pair
+style will also be performed on the CPU.
-When the CPU work in a GPU pair style has finished,
-the next force computation will begin, possibly before the
-GPU has finished. If {split} is 1.0 in the gpu fix, the next
-force computation will begin almost immediately. This can
-be used to run a "hybrid"_pair_hybrid.html GPU pair style at
-the same time as a hybrid CPU pair style. In this case, the
-GPU pair style should be first in the hybrid command in order to
-perform simultaneous calculations. This also
-allows "bond"_bond_style.html, "angle"_angle_style.html,
-"dihedral"_dihedral_style.html, "improper"_improper_style.html,
-and "long-range"_kspace_style.html force
-computations to be run simultaneously with the GPU pair style.
-Once all CPU force computations have completed, the gpu fix
-will block until the GPU has finished all work before continuing
-the run.
+When the CPU work in a GPU pair style has finished, the next force
+computation will begin, possibly before the GPU has finished. If
+{split} is 1.0 in the gpu fix, the next force computation will begin
+almost immediately. This can be used to run a
+"hybrid"_pair_hybrid.html GPU pair style at the same time as a hybrid
+CPU pair style. In this case, the GPU pair style should be first in
+the hybrid command in order to perform simultaneous calculations. This
+also allows "bond"_bond_style.html, "angle"_angle_style.html,
+"dihedral"_dihedral_style.html, "improper"_improper_style.html, and
+"long-range"_kspace_style.html force computations to be run
+simultaneously with the GPU pair style. Once all CPU force
+computations have completed, the gpu fix will block until the GPU has
+finished all work before continuing the run.
GPU timing :h4
GPU accelerated pair styles can perform computations asynchronously
-with CPU computations. The "Pair" time reported by LAMMPS
-will be the maximum of the time required to complete the CPU
-pair style computations and the time required to complete the GPU
-pair style computations. Any time spent for GPU-enabled pair styles
-for computations that run simultaneously with "bond"_bond_style.html,
-"angle"_angle_style.html, "dihedral"_dihedral_style.html,
-"improper"_improper_style.html, and "long-range"_kspace_style.html calculations
-will not be included in the "Pair" time.
+with CPU computations. The "Pair" time reported by LAMMPS will be the
+maximum of the time required to complete the CPU pair style
+computations and the time required to complete the GPU pair style
+computations. Any time spent for GPU-enabled pair styles for
+computations that run simultaneously with "bond"_bond_style.html,
+"angle"_angle_style.html, "dihedral"_dihedral_style.html,
+"improper"_improper_style.html, and "long-range"_kspace_style.html
+calculations will not be included in the "Pair" time.
-When {mode} for the gpu fix is force/neigh,
-the time for neighbor list calculations on the GPU will be added
-into the "Pair" time, not the "Neigh" time. A breakdown of the
-times required for various tasks on the GPU (data copy, neighbor
-calculations, force computations, etc.) are output only
-with the LAMMPS screen output at the end of each run. These timings represent
-total time spent on the GPU for each routine, regardless of asynchronous
-CPU calculations.
+When {mode} for the gpu fix is force/neigh, the time for neighbor list
+calculations on the GPU will be added into the "Pair" time, not the
+"Neigh" time. A breakdown of the times required for various tasks on
+the GPU (data copy, neighbor calculations, force computations, etc.)
+is output only with the LAMMPS screen output at the end of each
+run. These timings represent total time spent on the GPU for each
+routine, regardless of asynchronous CPU calculations.
GPU single vs double precision :h4
-See the lammps/lib/gpu/README file for instructions on how to build
-the LAMMPS gpu library for single, mixed, and double precision. The latter
-requires that your GPU card supports double precision.
+See the lammps/lib/gpu/README file for instructions on how to build
+the LAMMPS gpu library for single, mixed, and double precision. The
+latter requires that your GPU card supports double precision.
:line
diff --git a/doc/fix_gpu.html b/doc/fix_gpu.html
index 72839bc0d1..f71a8e8a4a 100644
--- a/doc/fix_gpu.html
+++ b/doc/fix_gpu.html
@@ -48,14 +48,13 @@ should not be any problems with specifying this fix first in input scripts.
mode specifies where neighbor list calculations will be performed.
If mode is force, neighbor list calculation is performed on the
CPU. If mode is force/neigh, neighbor list calculation is
-performed on the GPU. GPU neighbor
-list calculation currently cannot be used with a triclinic box.
+performed on the GPU. GPU neighbor list calculation currently cannot be
+used with a triclinic box. GPU neighbor list calculation currently
+cannot be used with hybrid pair styles.
GPU neighbor lists are not compatible with styles that are not GPU-enabled.
When a non-GPU enabled style requires a neighbor list, it will also be
built using CPU routines. In these cases, it will typically be more efficient
-to only use CPU neighbor list builds. For hybrid pair
-styles, GPU calculated neighbor lists might be less efficient because
-no particles will be skipped in a given neighbor list.
+to only use CPU neighbor list builds.
first and last specify the GPUs that will be used for simulation.
On each node, the GPU IDs in the inclusive range from first to last will
@@ -77,7 +76,8 @@ style.
In order to use GPU acceleration, a GPU enabled style must be
selected in the input script in addition to this fix. Currently,
-this is limited to a few pair styles.
+this is limited to a few pair styles and
+the PPPM kspace style.
More details about these settings and various possible hardware
configuration are in this section of the
@@ -95,8 +95,10 @@ the run command.
Restrictions:
The fix must be the first fix specified for a given run. The force/neigh
-mode should not be used with a triclinic box or GPU-enabled pair styles
-that need special_bonds settings.
+mode should not be used with a triclinic box or hybrid
+pair styles.
+
+split must be positive when using hybrid pair styles.
Currently, group-ID must be all.
diff --git a/doc/fix_gpu.txt b/doc/fix_gpu.txt
index 88fa6f5414..df8fbadb8f 100644
--- a/doc/fix_gpu.txt
+++ b/doc/fix_gpu.txt
@@ -39,14 +39,13 @@ should not be any problems with specifying this fix first in input scripts.
{mode} specifies where neighbor list calculations will be performed.
If {mode} is force, neighbor list calculation is performed on the
CPU. If {mode} is force/neigh, neighbor list calculation is
-performed on the GPU. GPU neighbor
-list calculation currently cannot be used with a triclinic box.
+performed on the GPU. GPU neighbor list calculation currently cannot be
+used with a triclinic box. GPU neighbor list calculation currently
+cannot be used with "hybrid"_pair_hybrid.html pair styles.
GPU neighbor lists are not compatible with styles that are not GPU-enabled.
When a non-GPU enabled style requires a neighbor list, it will also be
built using CPU routines. In these cases, it will typically be more efficient
-to only use CPU neighbor list builds. For "hybrid"_pair_hybrid.html pair
-styles, GPU calculated neighbor lists might be less efficient because
-no particles will be skipped in a given neighbor list.
+to only use CPU neighbor list builds.
{first} and {last} specify the GPUs that will be used for simulation.
On each node, the GPU IDs in the inclusive range from {first} to {last} will
@@ -68,7 +67,8 @@ style.
In order to use GPU acceleration, a GPU enabled style must be
selected in the input script in addition to this fix. Currently,
-this is limited to a few "pair styles"_pair_style.html.
+this is limited to a few "pair styles"_pair_style.html and
+the PPPM "kspace style"_kspace_style.html.
More details about these settings and various possible hardware
configuration are in "this section"_Section_start.html#2_8 of the
@@ -86,8 +86,10 @@ the "run"_run.html command.
[Restrictions:]
The fix must be the first fix specified for a given run. The force/neigh
-{mode} should not be used with a triclinic box or GPU-enabled pair styles
-that need "special_bonds"_special_bonds.html settings.
+{mode} should not be used with a triclinic box or "hybrid"_pair_hybrid.html
+pair styles.
+
+{split} must be positive when using "hybrid"_pair_hybrid.html pair styles.
Currently, group-ID must be all.
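
As an illustration of these restrictions, a fixed-split configuration
that is compatible with a hybrid pair style might look like the
following sketch (the GPU IDs and split value are placeholders): the
force mode builds neighbor lists on the CPU, and the positive split
assigns 70% of the particles to the GPU:

fix 0 all gpu force 0 1 0.7
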
diff --git a/doc/kspace_style.html b/doc/kspace_style.html
index 57c035f570..30b0bcbc1b 100644
--- a/doc/kspace_style.html
+++ b/doc/kspace_style.html
@@ -15,7 +15,7 @@
kspace_style style value
-- style = none or ewald or pppm or pppm/tip4p or ewald/n
+- style = none or ewald or pppm or pppm/tip4p or ewald/n or pppm/gpu/single or pppm/gpu/double
none value = none
ewald value = precision
@@ -25,6 +25,10 @@
pppm/tip4p value = precision
precision = desired accuracy
ewald/n value = precision
+ precision = desired accuracy
+ pppm/gpu/single value = precision
+ precision = desired accuracy
+ pppm/gpu/double value = precision
precision = desired accuracy
@@ -72,6 +76,11 @@ long-range potentials.
Currently, only the ewald/n style can be used with non-orthogonal
(triclinic symmetry) simulation boxes.
+The pppm/gpu/single and pppm/gpu/double styles are GPU-enabled
+versions of pppm. See more details below.
+
+
+
When a kspace style is used, a pair style that includes the
short-range correction to the pairwise Coulombic or other 1/r^N forces
must also be selected. For Coulombic interactions, these styles are
@@ -88,6 +97,27 @@ of K-space vectors for style ewald or the FFT grid size for style
See the kspace_modify command for additional
options of the K-space solvers that can be set.
+
+
+The pppm/gpu/single style performs single precision
+charge assignment and force interpolation calculations on the GPU.
+The pppm/gpu/double style performs the mesh calculations on the GPU
+in double precision. FFT solves are calculated on the CPU in both
+cases. If either pppm/gpu/single or pppm/gpu/double is used with
+a GPU-enabled pair style, part of the PPPM calculation can be performed
+concurrently on the GPU while other non-bonded and bonded force
+calculations are performed on the CPU.
+
+More details about GPU settings and various possible hardware
+configurations are in this section of the
+manual.
+
+Additional requirements in your input script to run with GPU-enabled
+PPPM styles are as follows:
+
+fix gpu must be used. The fix controls
+the essential GPU selection and initialization steps.
+
Restrictions:
A simulation must be 3d and periodic in all dimensions to use an Ewald
@@ -103,6 +133,11 @@ LAMMPS section for more info.
enabled if LAMMPS was built with that package. See the Making
LAMMPS section for more info.
+The pppm/gpu/single and pppm/gpu/double styles are part of the
+"gpu" package. They are only enabled if LAMMPS was built with that
+package. See the Making LAMMPS section for
+more info.
+
When using a long-range pairwise TIP4P potential, you must use kspace
style pppm/tip4p and vice versa.
diff --git a/doc/kspace_style.txt b/doc/kspace_style.txt
index b6b12696d2..217978c193 100644
--- a/doc/kspace_style.txt
+++ b/doc/kspace_style.txt
@@ -12,7 +12,7 @@ kspace_style command :h3
kspace_style style value :pre
-style = {none} or {ewald} or {pppm} or {pppm/tip4p} or {ewald/n} :ulb,l
+style = {none} or {ewald} or {pppm} or {pppm/tip4p} or {ewald/n} or {pppm/gpu/single} or {pppm/gpu/double} :ulb,l
{none} value = none
{ewald} value = precision
precision = desired accuracy
@@ -21,6 +21,10 @@ style = {none} or {ewald} or {pppm} or {pppm/tip4p} or {ewald/n} :ulb,l
{pppm/tip4p} value = precision
precision = desired accuracy
{ewald/n} value = precision
+ precision = desired accuracy
+ {pppm/gpu/single} value = precision
+ precision = desired accuracy
+ {pppm/gpu/double} value = precision
precision = desired accuracy :pre
:ule
@@ -67,6 +71,11 @@ long-range potentials.
Currently, only the {ewald/n} style can be used with non-orthogonal
(triclinic symmetry) simulation boxes.
+The {pppm/gpu/single} and {pppm/gpu/double} styles are GPU-enabled
+versions of {pppm}. See more details below.
+
+:line
+
When a kspace style is used, a pair style that includes the
short-range correction to the pairwise Coulombic or other 1/r^N forces
must also be selected. For Coulombic interactions, these styles are
@@ -83,6 +92,27 @@ of K-space vectors for style {ewald} or the FFT grid size for style
See the "kspace_modify"_kspace_modify.html command for additional
options of the K-space solvers that can be set.
+:line
+
+The {pppm/gpu/single} style performs single precision
+charge assignment and force interpolation calculations on the GPU.
+The {pppm/gpu/double} style performs the mesh calculations on the GPU
+in double precision. FFT solves are calculated on the CPU in both
+cases. If either {pppm/gpu/single} or {pppm/gpu/double} is used with
+a GPU-enabled pair style, part of the PPPM calculation can be performed
+concurrently on the GPU while other non-bonded and bonded force
+calculations are performed on the CPU.
+
+More details about GPU settings and various possible hardware
+configurations are in "this section"_Section_start.html#2_8 of the
+manual.
+
+Additional requirements in your input script to run with GPU-enabled
+PPPM styles are as follows:
+
+"fix gpu"_fix_gpu.html must be used. The fix controls
+the essential GPU selection and initialization steps.
+
[Restrictions:]
A simulation must be 3d and periodic in all dimensions to use an Ewald
@@ -98,6 +128,11 @@ The {ewald/n} style is part of the "user-ewaldn" package. It is only
enabled if LAMMPS was built with that package. See the "Making
LAMMPS"_Section_start.html#2_3 section for more info.
+The {pppm/gpu/single} and {pppm/gpu/double} styles are part of the
+"gpu" package. They are only enabled if LAMMPS was built with that
+package. See the "Making LAMMPS"_Section_start.html#2_3 section for
+more info.
+
When using a long-range pairwise TIP4P potential, you must use kspace
style {pppm/tip4p} and vice versa.
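
To illustrate the GPU-enabled PPPM styles described above, a sketch of
the relevant input-script lines might be (the pair style, cutoff, and
accuracy values are placeholders):

newton          off
fix             0 all gpu force/neigh 0 0 -1
pair_style      lj/cut/coul/long/gpu 10.0
kspace_style    pppm/gpu/single 1.0e-4

With a negative split, the fraction of particles handled on the GPU is
tuned automatically, as described in Section_start.html.
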
diff --git a/doc/pair_coeff.html b/doc/pair_coeff.html
index fa98d3addd..0f54432555 100644
--- a/doc/pair_coeff.html
+++ b/doc/pair_coeff.html
@@ -134,6 +134,7 @@ the pair_style command, and coefficients specified by the associated
- pair_style lj/cut/coul/long/gpu - GPU-enabled version of LJ with long-range Coulomb
- pair_style lj/cut/coul/long/tip4p - LJ with long-range Coulomb for TIP4P water
- pair_style lj/expand - Lennard-Jones for variable size particles
+- pair_style lj/expand/gpu - GPU-enabled version of lj/expand
- pair_style lj/gromacs - GROMACS-style Lennard-Jones potential
- pair_style lj/gromacs/coul/gromacs - GROMACS-style LJ and Coulombic potential
- pair_style lj/smooth - smoothed Lennard-Jones potential
@@ -142,6 +143,7 @@ the pair_style command, and coefficients specified by the associated
- pair_style lubricate - hydrodynamic lubrication forces
- pair_style meam - modified embedded atom method (MEAM)
- pair_style morse - Morse potential
+- pair_style morse/gpu - GPU-enabled version of Morse potential
- pair_style morse/opt - optimized version of Morse potential
- pair_style peri/lps - peridynamic LPS potential
- pair_style peri/pmb - peridynamic PMB potential
diff --git a/doc/pair_coeff.txt b/doc/pair_coeff.txt
index baf95341db..308e35329c 100644
--- a/doc/pair_coeff.txt
+++ b/doc/pair_coeff.txt
@@ -131,6 +131,7 @@ the pair_style command, and coefficients specified by the associated
"pair_style lj/cut/coul/long/gpu"_pair_lj.html - GPU-enabled version of LJ with long-range Coulomb
"pair_style lj/cut/coul/long/tip4p"_pair_lj.html - LJ with long-range Coulomb for TIP4P water
"pair_style lj/expand"_pair_lj_expand.html - Lennard-Jones for variable size particles
+"pair_style lj/expand/gpu"_pair_lj_expand.html - GPU-enabled version of lj/expand
"pair_style lj/gromacs"_pair_gromacs.html - GROMACS-style Lennard-Jones potential
"pair_style lj/gromacs/coul/gromacs"_pair_gromacs.html - GROMACS-style LJ and Coulombic potential
"pair_style lj/smooth"_pair_lj_smooth.html - smoothed Lennard-Jones potential
@@ -139,6 +140,7 @@ the pair_style command, and coefficients specified by the associated
"pair_style lubricate"_pair_lubricate.html - hydrodynamic lubrication forces
"pair_style meam"_pair_meam.html - modified embedded atom method (MEAM)
"pair_style morse"_pair_morse.html - Morse potential
+"pair_style morse/gpu"_pair_morse.html - GPU-enabled version of Morse potential
"pair_style morse/opt"_pair_morse.html - optimized version of Morse potential
"pair_style peri/lps"_pair_peri.html - peridynamic LPS potential
"pair_style peri/pmb"_pair_peri.html - peridynamic PMB potential
diff --git a/doc/pair_lj_expand.html b/doc/pair_lj_expand.html
index 8dfb3d2068..9e766d3f4b 100644
--- a/doc/pair_lj_expand.html
+++ b/doc/pair_lj_expand.html
@@ -11,10 +11,14 @@
pair_style lj/expand command
+pair_style lj/expand/gpu command
+
Syntax:
pair_style lj/expand cutoff
+pair_style lj/expand/gpu cutoff
+
- cutoff = global cutoff for lj/expand interactions (distance units)
Examples:
@@ -49,6 +53,29 @@ commands, or by mixing as described below:
The delta values can be positive or negative. The last coefficient is
optional. If not specified, the global LJ cutoff is used.
+Style lj/expand/gpu is a GPU-enabled version of style lj/expand.
+See more details below.
+
+
+
+The lj/expand/gpu style is identical to the lj/expand style,
+except that each processor off-loads its pairwise calculations to a
+GPU chip. Depending on the hardware available on your system this can provide a
+speed-up. See the Running on GPUs section of
+the manual for more details about hardware and software requirements
+for using GPUs.
+
+More details about these settings and various possible hardware
+configurations are in this section of the
+manual.
+
+Additional requirements in your input script to run with GPU-enabled styles
+are as follows:
+
+The newton pair setting must be off and
+fix gpu must be used. The fix controls
+the essential GPU selection and initialization steps.
+
Mixing, shift, table, tail correction, restart, rRESPA info:
@@ -80,7 +107,11 @@ to be specified in an input script that reads a restart file.
-Restrictions: none
+Restrictions:
+
+The lj/expand/gpu style is part of the "gpu" package. It is only
+enabled if LAMMPS was built with that package. See the Making
+LAMMPS section for more info.
Related commands:
diff --git a/doc/pair_lj_expand.txt b/doc/pair_lj_expand.txt
index 3c82f5b944..96487df87e 100644
--- a/doc/pair_lj_expand.txt
+++ b/doc/pair_lj_expand.txt
@@ -7,10 +7,12 @@
:line
pair_style lj/expand command :h3
+pair_style lj/expand/gpu command :h3
[Syntax:]
pair_style lj/expand cutoff :pre
+pair_style lj/expand/gpu cutoff :pre
cutoff = global cutoff for lj/expand interactions (distance units) :ul
@@ -46,6 +48,29 @@ cutoff (distance units) :ul
The delta values can be positive or negative. The last coefficient is
optional. If not specified, the global LJ cutoff is used.
+Style {lj/expand/gpu} is a GPU-enabled version of style {lj/expand}.
+See more details below.
+
+:line
+
+The {lj/expand/gpu} style is identical to the {lj/expand} style,
+except that each processor off-loads its pairwise calculations to a
+GPU chip. Depending on the hardware available on your system this can provide a
+speed-up. See the "Running on GPUs"_Section_start.html#2_8 section of
+the manual for more details about hardware and software requirements
+for using GPUs.
+
+More details about these settings and various possible hardware
+configurations are in "this section"_Section_start.html#2_8 of the
+manual.
+
+Additional requirements in your input script to run with GPU-enabled styles
+are as follows:
+
+The "newton pair"_newton.html setting must be {off} and
+"fix gpu"_fix_gpu.html must be used. The fix controls
+the essential GPU selection and initialization steps.
+
:line
[Mixing, shift, table, tail correction, restart, rRESPA info]:
@@ -77,7 +102,11 @@ This pair style can only be used via the {pair} keyword of the
:line
-[Restrictions:] none
+[Restrictions:]
+
+The {lj/expand/gpu} style is part of the "gpu" package. It is only
+enabled if LAMMPS was built with that package. See the "Making
+LAMMPS"_Section_start.html#2_3 section for more info.
[Related commands:]
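
As an illustration of the input-script requirements listed above for
the GPU-enabled style, a minimal sketch might be (the cutoff and
coefficients are placeholders):

newton          off
fix             0 all gpu force/neigh 0 0 1.0
pair_style      lj/expand/gpu 3.5
pair_coeff      * * 1.0 1.0 0.5
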
diff --git a/doc/pair_morse.html b/doc/pair_morse.html
index e5183ef53e..0f505c5d28 100644
--- a/doc/pair_morse.html
+++ b/doc/pair_morse.html
@@ -11,12 +11,18 @@
pair_style morse command
+pair_style morse/gpu command
+
pair_style morse/opt command
Syntax:
pair_style morse cutoff
+pair_style morse/gpu cutoff
+
+pair_style morse/opt cutoff
+
- cutoff = global cutoff for Morse interactions (distance units)
Examples:
@@ -53,6 +59,29 @@ give identical answers. Depending on system size and the processor
you are running on, it may be 5-25% faster (for the pairwise portion
of the run time).
+Style morse/gpu is a GPU-enabled version of style morse.
+See more details below.
+
+
+
+The morse/gpu style is identical to the morse style,
+except that each processor off-loads its pairwise calculations to a
+GPU chip. Depending on the hardware available on your system this can provide a
+speed-up. See the Running on GPUs section of
+the manual for more details about hardware and software requirements
+for using GPUs.
+
+More details about these settings and various possible hardware
+configurations are in this section of the
+manual.
+
+Additional requirements in your input script to run with GPU-enabled styles
+are as follows:
+
+The newton pair setting must be off and
+fix gpu must be used. The fix controls
+the essential GPU selection and initialization steps.
+
Mixing, shift, table, tail correction, restart, rRESPA info:
@@ -82,8 +111,9 @@ to be specified in an input script that reads a restart file.
Restrictions:
-The morse/opt style is part of the "opt" package. It is only
-enabled if LAMMPS was built with that package. See the Making
+The morse/opt style is part of the "opt" package. The morse/gpu
+style is part of the "gpu" package. They are only
+enabled if LAMMPS was built with those packages. See the Making
LAMMPS section for more info.
Related commands:
diff --git a/doc/pair_morse.txt b/doc/pair_morse.txt
index 1c1799c242..8e23d84767 100644
--- a/doc/pair_morse.txt
+++ b/doc/pair_morse.txt
@@ -7,11 +7,14 @@
:line
pair_style morse command :h3
+pair_style morse/gpu command :h3
pair_style morse/opt command :h3
[Syntax:]
pair_style morse cutoff :pre
+pair_style morse/gpu cutoff :pre
+pair_style morse/opt cutoff :pre
cutoff = global cutoff for Morse interactions (distance units) :ul
@@ -49,6 +52,29 @@ give identical answers. Depending on system size and the processor
you are running on, it may be 5-25% faster (for the pairwise portion
of the run time).
+Style {morse/gpu} is a GPU-enabled version of style {morse}.
+See more details below.
+
+:line
+
+The {morse/gpu} style is identical to the {morse} style,
+except that each processor off-loads its pairwise calculations to a
+GPU chip. Depending on the hardware available on your system this can provide a
+speed-up. See the "Running on GPUs"_Section_start.html#2_8 section of
+the manual for more details about hardware and software requirements
+for using GPUs.
+
+More details about these settings and various possible hardware
+configurations are in "this section"_Section_start.html#2_8 of the
+manual.
+
+Additional requirements in your input script to run with GPU-enabled styles
+are as follows:
+
+The "newton pair"_newton.html setting must be {off} and
+"fix gpu"_fix_gpu.html must be used. The fix controls
+the essential GPU selection and initialization steps.
+
:line
[Mixing, shift, table, tail correction, restart, rRESPA info]:
@@ -78,8 +104,9 @@ These pair styles can only be used via the {pair} keyword of the
[Restrictions:]
-The {morse/opt} style is part of the "opt" package. It is only
-enabled if LAMMPS was built with that package. See the "Making
+The {morse/opt} style is part of the "opt" package. The {morse/gpu}
+style is part of the "gpu" package. They are only
+enabled if LAMMPS was built with those packages. See the "Making
LAMMPS"_Section_start.html#2_3 section for more info.
[Related commands:]
diff --git a/doc/pair_style.html b/doc/pair_style.html
index 450428a7bc..862a22d7cc 100644
--- a/doc/pair_style.html
+++ b/doc/pair_style.html
@@ -136,6 +136,7 @@ the pair_style command, and coefficients specified by the associated
- pair_style lj/cut/coul/long/gpu - GPU-enabled version of LJ with long-range Coulomb
- pair_style lj/cut/coul/long/tip4p - LJ with long-range Coulomb for TIP4P water
- pair_style lj/expand - Lennard-Jones for variable size particles
+- pair_style lj/expand/gpu - GPU-enabled version of lj/expand
- pair_style lj/gromacs - GROMACS-style Lennard-Jones potential
- pair_style lj/gromacs/coul/gromacs - GROMACS-style LJ and Coulombic potential
- pair_style lj/smooth - smoothed Lennard-Jones potential
@@ -144,6 +145,7 @@ the pair_style command, and coefficients specified by the associated
- pair_style lubricate - hydrodynamic lubrication forces
- pair_style meam - modified embedded atom method (MEAM)
- pair_style morse - Morse potential
+- pair_style morse/gpu - GPU-enabled version of Morse potential
- pair_style morse/opt - optimized version of Morse potential
- pair_style peri/lps - peridynamic LPS potential
- pair_style peri/pmb - peridynamic PMB potential
diff --git a/doc/pair_style.txt b/doc/pair_style.txt
index 0db8457ea5..1943b32c99 100644
--- a/doc/pair_style.txt
+++ b/doc/pair_style.txt
@@ -133,6 +133,7 @@ the pair_style command, and coefficients specified by the associated
"pair_style lj/cut/coul/long/gpu"_pair_lj.html - GPU-enabled version of LJ with long-range Coulomb
"pair_style lj/cut/coul/long/tip4p"_pair_lj.html - LJ with long-range Coulomb for TIP4P water
"pair_style lj/expand"_pair_lj_expand.html - Lennard-Jones for variable size particles
+"pair_style lj/expand/gpu"_pair_lj_expand.html - GPU-enabled version of lj/expand
"pair_style lj/gromacs"_pair_gromacs.html - GROMACS-style Lennard-Jones potential
"pair_style lj/gromacs/coul/gromacs"_pair_gromacs.html - GROMACS-style LJ and Coulombic potential
"pair_style lj/smooth"_pair_lj_smooth.html - smoothed Lennard-Jones potential
@@ -141,6 +142,7 @@ the pair_style command, and coefficients specified by the associated
"pair_style lubricate"_pair_lubricate.html - hydrodynamic lubrication forces
"pair_style meam"_pair_meam.html - modified embedded atom method (MEAM)
"pair_style morse"_pair_morse.html - Morse potential
+"pair_style morse/gpu"_pair_morse.html - GPU-enabled version of Morse potential
"pair_style morse/opt"_pair_morse.html - optimized version of Morse potential
"pair_style peri/lps"_pair_peri.html - peridynamic LPS potential
"pair_style peri/pmb"_pair_peri.html - peridynamic PMB potential