Merge pull request #1803 from stanmoore1/kk_mpi

Fix performance issues with CUDA-aware MPI
2019-12-11 13:34:33 -07:00 · 2019-12-11 13:34:33 -07:00 · 5e4e6b2853
parent a6a354279c e0a771d5cb
commit 5e4e6b2853
3 changed files with 13 additions and 666 deletions
--- a/doc/src/package.rst
+++ b/doc/src/package.rst
@ -547,10 +547,10 @@ the *cuda/aware* keyword is automatically set to *off* by default. When
 the *cuda/aware* keyword is set to *off* while any of the *comm*
 keywords are set to *device*\ , the value for these *comm* keywords will
 be automatically changed to *host*\ . This setting has no effect if not
-running on GPUs. CUDA-aware MPI is available for OpenMPI 1.8 (or later
-versions), Mvapich2 1.9 (or later) when the "MV2\_USE\_CUDA" environment
-variable is set to "1", CrayMPI, and IBM Spectrum MPI when the "-gpu"
-flag is used.
+running on GPUs or if using only one MPI rank. CUDA-aware MPI is available
+for OpenMPI 1.8 (or later versions), Mvapich2 1.9 (or later) when the
+"MV2\_USE\_CUDA" environment variable is set to "1", CrayMPI, and IBM
+Spectrum MPI when the "-gpu" flag is used.


 ----------
--- a/doc/txt/package.txt
+++ b/doc/txt/package.txt
@ -1,660 +0,0 @@
-"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
-
-:link(lws,http://lammps.sandia.gov)
-:link(ld,Manual.html)
-:link(lc,Commands_all.html)
-
-:line
-
-package command :h3
-
-[Syntax:]
-
-package style args :pre
-
-style = {gpu} or {intel} or {kokkos} or {omp} :ulb,l
-args = arguments specific to the style :l
-  {gpu} args = Ngpu keyword value ...
-    Ngpu = # of GPUs per node
-    zero or more keyword/value pairs may be appended
-    keywords = {neigh} or {newton} or {binsize} or {split} or {gpuID} or {tpa} or {device} or {blocksize}
-      {neigh} value = {yes} or {no}
-        yes = neighbor list build on GPU (default)
-        no = neighbor list build on CPU
-      {newton} = {off} or {on}
-        off = set Newton pairwise flag off (default and required)
-        on = set Newton pairwise flag on (currently not allowed)
-      {binsize} value = size
-        size = bin size for neighbor list construction (distance units)
-      {split} = fraction
-        fraction = fraction of atoms assigned to GPU (default = 1.0)
-      {gpuID} values = first last
-        first = ID of first GPU to be used on each node
-        last = ID of last GPU to be used on each node
-      {tpa} value = Nthreads
-        Nthreads = # of GPU threads used per atom
-      {device} value = device_type or platform_id:device_type or platform_id:custom,val1,val2,val3,..,val13
-        platform_id = numerical OpenCL platform id (default: -1)
-        device_type = {kepler} or {fermi} or {cypress} or {intel} or {phi} or {generic} or {custom}
-        val1,val2,... = custom OpenCL tune parameters (see below for details)
-      {blocksize} value = size
-        size = thread block size for pair force computation
-  {intel} args = NPhi keyword value ...
-    Nphi = # of co-processors per node
-    zero or more keyword/value pairs may be appended
-    keywords = {mode} or {omp} or {lrt} or {balance} or {ghost} or {tpc} or {tptask} or {no_affinity}
-      {mode} value = {single} or {mixed} or {double}
-        single = perform force calculations in single precision
-        mixed = perform force calculations in mixed precision
-        double = perform force calculations in double precision
-      {omp} value = Nthreads
-        Nthreads = number of OpenMP threads to use on CPU (default = 0)
-      {lrt} value = {yes} or {no}
-        yes = use additional thread dedicated for some PPPM calculations
-        no = do not dedicate an extra thread for some PPPM calculations
-      {balance} value = split
-        split = fraction of work to offload to co-processor, -1 for dynamic
-      {ghost} value = {yes} or {no}
-        yes = include ghost atoms for offload
-        no = do not include ghost atoms for offload
-      {tpc} value = Ntpc
-        Ntpc = max number of co-processor threads per co-processor core (default = 4)
-      {tptask} value = Ntptask
-        Ntptask = max number of co-processor threads per MPI task (default = 240)
-      {no_affinity} values = none
-  {kokkos} args = keyword value ...
-    zero or more keyword/value pairs may be appended
-    keywords = {neigh} or {neigh/qeq} or {neigh/thread} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} or {cuda/aware}
-      {neigh} value = {full} or {half}
-        full = full neighbor list
-        half = half neighbor list built in thread-safe manner
-      {neigh/qeq} value = {full} or {half}
-        full = full neighbor list
-        half = half neighbor list built in thread-safe manner
-      {neigh/thread} value = {off} or {on}
-        off = thread only over atoms
-        on = thread over both atoms and neighbors
-      {newton} = {off} or {on}
-        off = set Newton pairwise and bonded flags off
-        on = set Newton pairwise and bonded flags on
-      {binsize} value = size
-        size = bin size for neighbor list construction (distance units)
-      {comm} value = {no} or {host} or {device}
-        use value for comm/exchange and comm/forward and comm/reverse
-      {comm/exchange} value = {no} or {host} or {device}
-      {comm/forward} value = {no} or {host} or {device}
-      {comm/reverse} value = {no} or {host} or {device}
-        no = perform communication pack/unpack in non-KOKKOS mode
-        host = perform pack/unpack on host (e.g. with OpenMP threading)
-        device = perform pack/unpack on device (e.g. on GPU)
-      {cuda/aware} = {off} or {on}
-        off = do not use CUDA-aware MPI
-        on = use CUDA-aware MPI (default)
-  {omp} args = Nthreads keyword value ...
-    Nthread = # of OpenMP threads to associate with each MPI process
-    zero or more keyword/value pairs may be appended
-    keywords = {neigh}
-      {neigh} value = {yes} or {no}
-        yes = threaded neighbor list build (default)
-        no = non-threaded neighbor list build :pre
-:ule
-
-[Examples:]
-
-package gpu 1
-package gpu 1 split 0.75
-package gpu 2 split -1.0
-package gpu 1 device kepler
-package gpu 1 device 2:generic
-package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128
-package kokkos neigh half comm device
-package omp 0 neigh no
-package omp 4
-package intel 1
-package intel 2 omp 4 mode mixed balance 0.5 :pre
-
-[Description:]
-
-This command invokes package-specific settings for the various
-accelerator packages available in LAMMPS.  Currently the following
-packages use settings from this command: GPU, USER-INTEL, KOKKOS, and
-USER-OMP.
-
-If this command is specified in an input script, it must be near the
-top of the script, before the simulation box has been defined.  This
-is because it specifies settings that the accelerator packages use in
-their initialization, before a simulation is defined.
-
-This command can also be specified from the command-line when
-launching LAMMPS, using the "-pk" "command-line
-switch"_Run_options.html.  The syntax is exactly the same as when used
-in an input script.
-
-Note that all of the accelerator packages require the package command
-to be specified (except the OPT package), if the package is to be used
-in a simulation (LAMMPS can be built with an accelerator package
-without using it in a particular simulation).  However, in all cases,
-a default version of the command is typically invoked by other
-accelerator settings.
-
-The KOKKOS package requires a "-k on" "command-line
-switch"_Run_options.html respectively, which invokes a "package
-kokkos" command with default settings.
-
-For the GPU, USER-INTEL, and USER-OMP packages, if a "-sf gpu" or "-sf
-intel" or "-sf omp" "command-line switch"_Run_options.html is used to
-auto-append accelerator suffixes to various styles in the input
-script, then those switches also invoke a "package gpu", "package
-intel", or "package omp" command with default settings.
-
-NOTE: A package command for a particular style can be invoked multiple
-times when a simulation is setup, e.g. by the "-c on, -k on, -sf, and
-pk command-line switches"_Run_options.html, and by using this command
-in an input script.  Each time it is used all of the style options are
-set, either to default values or to specified settings.  I.e. settings
-from previous invocations do not persist across multiple invocations.
-
-See the "Speed packages"_Speed_packages.html doc page for more details
-about using the various accelerator packages for speeding up LAMMPS
-simulations.
-
-:line
-
-The {gpu} style invokes settings associated with the use of the GPU
-package.
-
-The {Ngpu} argument sets the number of GPUs per node.  There must be
-at least as many MPI tasks per node as GPUs, as set by the mpirun or
-mpiexec command.  If there are more MPI tasks (per node)
-than GPUs, multiple MPI tasks will share each GPU.
-
-Optional keyword/value pairs can also be specified.  Each has a
-default value as listed below.
-
-The {neigh} keyword specifies where neighbor lists for pair style
-computation will be built.  If {neigh} is {yes}, which is the default,
-neighbor list building is performed on the GPU.  If {neigh} is {no},
-neighbor list building is performed on the CPU.  GPU neighbor list
-building currently cannot be used with a triclinic box.  GPU neighbor
-lists are not compatible with commands that are not GPU-enabled.  When
-a non-GPU enabled command requires a neighbor list, it will also be
-built on the CPU.  In these cases, it will typically be more efficient
-to only use CPU neighbor list builds.
-
-The {newton} keyword sets the Newton flags for pairwise (not bonded)
-interactions to {off} or {on}, the same as the "newton"_newton.html
-command allows.  Currently, only an {off} value is allowed, since all
-the GPU package pair styles require this setting.  This means more
-computation is done, but less communication.  In the future a value of
-{on} may be allowed, so the {newton} keyword is included as an option
-for compatibility with the package command for other accelerator
-styles.  Note that the newton setting for bonded interactions is not
-affected by this keyword.
-
-The {binsize} keyword sets the size of bins used to bin atoms in
-neighbor list builds performed on the GPU, if {neigh} = {yes} is set.
-If {binsize} is set to 0.0 (the default), then bins = the size of the
-pairwise cutoff + neighbor skin distance.  This is 2x larger than the
-LAMMPS default used for neighbor list building on the CPU.  This will
-be close to optimal for the GPU, so you do not normally need to use
-this keyword.  Note that if you use a longer-than-usual pairwise
-cutoff, e.g. to allow for a smaller fraction of KSpace work with a
-"long-range Coulombic solver"_kspace_style.html because the GPU is
-faster at performing pairwise interactions, then it may be optimal to
-make the {binsize} smaller than the default.  For example, with a
-cutoff of 20*sigma in LJ "units"_units.html and a neighbor skin
-distance of sigma, a {binsize} = 5.25*sigma can be more efficient than
-the default.
-
-The {split} keyword can be used for load balancing force calculations
-between CPU and GPU cores in GPU-enabled pair styles. If 0 < {split} <
-1.0, a fixed fraction of particles is offloaded to the GPU while force
-calculation for the other particles occurs simultaneously on the CPU.
-If {split} < 0.0, the optimal fraction (based on CPU and GPU timings)
-is calculated every 25 timesteps, i.e. dynamic load-balancing across
-the CPU and GPU is performed.  If {split} = 1.0, all force
-calculations for GPU accelerated pair styles are performed on the GPU.
-In this case, other "hybrid"_pair_hybrid.html pair interactions,
-"bond"_bond_style.html, "angle"_angle_style.html,
-"dihedral"_dihedral_style.html, "improper"_improper_style.html, and
-"long-range"_kspace_style.html calculations can be performed on the
-CPU while the GPU is performing force calculations for the GPU-enabled
-pair style.  If all CPU force computations complete before the GPU
-completes, LAMMPS will block until the GPU has finished before
-continuing the timestep.
-
-As an example, if you have two GPUs per node and 8 CPU cores per node,
-and would like to run on 4 nodes (32 cores) with dynamic balancing of
-force calculation across CPU and GPU cores, you could specify
-
-mpirun -np 32 -sf gpu -in in.script    # launch command
-package gpu 2 split -1                 # input script command :pre
-
-In this case, all CPU cores and GPU devices on the nodes would be
-utilized.  Each GPU device would be shared by 4 CPU cores. The CPU
-cores would perform force calculations for some fraction of the
-particles at the same time the GPUs performed force calculation for
-the other particles.
-
-The {gpuID} keyword allows selection of which GPUs on each node will
-be used for a simulation.  The {first} and {last} values specify the
-GPU IDs to use (from 0 to Ngpu-1).  By default, first = 0 and last =
-Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number
-of physical GPUs.  If you only wish to use a subset, set Ngpu to a
-smaller number and first/last to a sub-range of the available GPUs.
-
-The {tpa} keyword sets the number of GPU thread per atom used to
-perform force calculations.  With a default value of 1, the number of
-threads will be chosen based on the pair style, however, the value can
-be set explicitly with this keyword to fine-tune performance.  For
-large cutoffs or with a small number of particles per GPU, increasing
-the value can improve performance. The number of threads per atom must
-be a power of 2 and currently cannot be greater than 32.
-
-The {device} keyword can be used to tune parameters optimized for a
-specific accelerator and platform when using OpenCL. OpenCL supports
-the concept of a [platform], which represents one or more devices that
-share the same driver (e.g. there would be a different platform for
-GPUs from different vendors or for CPU based accelerator support).
-In LAMMPS only one platform can be active at a time and by default
-the first platform with an accelerator is selected. This is equivalent
-to using a platform ID of -1. The platform ID is a number corresponding
-to the output of the ocl_get_devices tool. The platform ID is passed
-to the GPU library, by prefixing the {device} keyword with that number
-separated by a colon. For CUDA, the {device} keyword is ignored.
-Currently, the device tuning support is limited to NVIDIA Kepler, NVIDIA
-Fermi, AMD Cypress, Intel x86_64 CPU, Intel Xeon Phi, or a generic device.
-More devices may be added later.  The default device type can be
-specified when building LAMMPS with the GPU library, via setting a
-variable in the lib/gpu/Makefile that is used.
-
-In addition, a device type {custom} is available, which is followed by
-13 comma separated numbers, which allows to set those tweakable parameters
-from the package command. It can be combined with the (colon separated)
-platform id. The individual settings are:
-
-MEM_THREADS
-THREADS_PER_ATOM
-THREADS_PER_CHARGE
-BLOCK_PAIR
-MAX_SHARED_TYPES
-BLOCK_NBOR_BUILD
-BLOCK_BIO_PAIR
-BLOCK_ELLIPSE
-WARP_SIZE
-PPPM_BLOCK_1D
-BLOCK_CELL_2D
-BLOCK_CELL_ID
-MAX_BIO_SHARED_TYPES :ul
-
-The {blocksize} keyword allows you to tweak the number of threads used
-per thread block. This number should be a multiple of 32 (for GPUs)
-and its maximum depends on the specific GPU hardware. Typical choices
-are 64, 128, or 256. A larger block size increases occupancy of
-individual GPU cores, but reduces the total number of thread blocks,
-thus may lead to load imbalance.
-
-:line
-
-The {intel} style invokes settings associated with the use of the
-USER-INTEL package.  All of its settings, except the {omp} and {mode}
-keywords, are ignored if LAMMPS was not built with Xeon Phi
-co-processor support.  All of its settings, including the {omp} and
-{mode} keyword are applicable if LAMMPS was built with co-processor
-support.
-
-The {Nphi} argument sets the number of co-processors per node.
-This can be set to any value, including 0, if LAMMPS was not
-built with co-processor support.
-
-Optional keyword/value pairs can also be specified.  Each has a
-default value as listed below.
-
-The {omp} keyword determines the number of OpenMP threads allocated
-for each MPI task when any portion of the interactions computed by a
-USER-INTEL pair style are run on the CPU.  This can be the case even
-if LAMMPS was built with co-processor support; see the {balance}
-keyword discussion below.  If you are running with less MPI tasks/node
-than there are CPUs, it can be advantageous to use OpenMP threading on
-the CPUs.
-
-NOTE: The {omp} keyword has nothing to do with co-processor threads on
-the Xeon Phi; see the {tpc} and {tptask} keywords below for a
-discussion of co-processor threads.
-
-The {Nthread} value for the {omp} keyword sets the number of OpenMP
-threads allocated for each MPI task.  Setting {Nthread} = 0 (the
-default) instructs LAMMPS to use whatever value is the default for the
-given OpenMP environment. This is usually determined via the
-{OMP_NUM_THREADS} environment variable or the compiler runtime, which
-is usually a value of 1.
-
-For more details, including examples of how to set the OMP_NUM_THREADS
-environment variable, see the discussion of the {Nthreads} setting on
-this doc page for the "package omp" command.  Nthreads is a required
-argument for the USER-OMP package.  Its meaning is exactly the same
-for the USER-INTEL package.
-
-NOTE: If you build LAMMPS with both the USER-INTEL and USER-OMP
-packages, be aware that both packages allow setting of the {Nthreads}
-value via their package commands, but there is only a single global
-{Nthreads} value used by OpenMP.  Thus if both package commands are
-invoked, you should insure the two values are consistent.  If they are
-not, the last one invoked will take precedence, for both packages.
-Also note that if the "-sf hybrid intel omp command-line
-switch"_Run_options.html is used, it invokes a "package intel"
-command, followed by a "package omp" command, both with a setting of
-{Nthreads} = 0.
-
-The {mode} keyword determines the precision mode to use for
-computing pair style forces, either on the CPU or on the co-processor,
-when using a USER-INTEL supported "pair style"_pair_style.html.  It
-can take a value of {single}, {mixed} which is the default, or
-{double}.  {Single} means single precision is used for the entire
-force calculation.  {Mixed} means forces between a pair of atoms are
-computed in single precision, but accumulated and stored in double
-precision, including storage of forces, torques, energies, and virial
-quantities.  {Double} means double precision is used for the entire
-force calculation.
-
-The {lrt} keyword can be used to enable "Long Range Thread (LRT)"
-mode. It can take a value of {yes} to enable and {no} to disable.
-LRT mode generates an extra thread (in addition to any OpenMP threads
-specified with the OMP_NUM_THREADS environment variable or the {omp}
-keyword). The extra thread is dedicated for performing part of the
-"PPPM solver"_kspace_style.html computations and communications. This
-can improve parallel performance on processors supporting
-Simultaneous Multithreading (SMT) such as Hyper-Threading (HT) on Intel
-processors. In this mode, one additional thread is generated per MPI
-process. LAMMPS will generate a warning in the case that more threads
-are used than available in SMT hardware on a node. If the PPPM solver
-from the USER-INTEL package is not used, then the LRT setting is
-ignored and no extra threads are generated. Enabling LRT will replace
-the "run_style"_run_style.html with the {verlet/lrt/intel} style that
-is identical to the default {verlet} style aside from supporting the
-LRT feature. This feature requires setting the pre-processor flag
-DLMP_INTEL_USELRT in the makefile when compiling LAMMPS.
-
-The {balance} keyword sets the fraction of "pair
-style"_pair_style.html work offloaded to the co-processor for split
-values between 0.0 and 1.0 inclusive.  While this fraction of work is
-running on the co-processor, other calculations will run on the host,
-including neighbor and pair calculations that are not offloaded, as
-well as angle, bond, dihedral, kspace, and some MPI communications.
-If {split} is set to -1, the fraction of work is dynamically adjusted
-automatically throughout the run.  This typically give performance
-within 5 to 10 percent of the optimal fixed fraction.
-
-The {ghost} keyword determines whether or not ghost atoms, i.e. atoms
-at the boundaries of processor sub-domains, are offloaded for neighbor
-and force calculations.  When the value = "no", ghost atoms are not
-offloaded.  This option can reduce the amount of data transfer with
-the co-processor and can also overlap MPI communication of forces with
-computation on the co-processor when the "newton pair"_newton.html
-setting is "on".  When the value = "yes", ghost atoms are offloaded.
-In some cases this can provide better performance, especially if the
-{balance} fraction is high.
-
-The {tpc} keyword sets the max # of co-processor threads {Ntpc} that
-will run on each core of the co-processor.  The default value = 4,
-which is the number of hardware threads per core supported by the
-current generation Xeon Phi chips.
-
-The {tptask} keyword sets the max # of co-processor threads (Ntptask}
-assigned to each MPI task.  The default value = 240, which is the
-total # of threads an entire current generation Xeon Phi chip can run
-(240 = 60 cores * 4 threads/core).  This means each MPI task assigned
-to the Phi will enough threads for the chip to run the max allowed,
-even if only 1 MPI task is assigned.  If 8 MPI tasks are assigned to
-the Phi, each will run with 30 threads.  If you wish to limit the
-number of threads per MPI task, set {tptask} to a smaller value.
-E.g. for {tptask} = 16, if 8 MPI tasks are assigned, each will run
-with 16 threads, for a total of 128.
-
-Note that the default settings for {tpc} and {tptask} are fine for
-most problems, regardless of how many MPI tasks you assign to a Phi.
-
-The {no_affinity} keyword will turn off automatic setting of core
-affinity for MPI tasks and OpenMP threads on the host when using
-offload to a co-processor. Affinity settings are used when possible
-to prevent MPI tasks and OpenMP threads from being on separate NUMA
-domains and to prevent offload threads from interfering with other
-processes/threads used for LAMMPS.
-
-:line
-
-The {kokkos} style invokes settings associated with the use of the
-KOKKOS package.
-
-All of the settings are optional keyword/value pairs. Each has a default
-value as listed below.
-
-The {neigh} keyword determines how neighbor lists are built. A value of
-{half} uses a thread-safe variant of half-neighbor lists, the same as
-used by most pair styles in LAMMPS, which is the default when running on
-CPUs (i.e. the Kokkos CUDA back end is not enabled).
-
-A value of {full} uses a full neighbor lists and is the default when
-running on GPUs. This performs twice as much computation as the {half}
-option, however that is often a win because it is thread-safe and
-doesn't require atomic operations in the calculation of pair forces. For
-that reason, {full} is the default setting for GPUs. However, when
-running on CPUs, a {half} neighbor list is the default because it are
-often faster, just as it is for non-accelerated pair styles. Similarly,
-the {neigh/qeq} keyword determines how neighbor lists are built for "fix
-qeq/reax/kk"_fix_qeq_reax.html. If not explicitly set, the value of
-{neigh/qeq} will match {neigh}.
-
-If the {neigh/thread} keyword is set to {off}, then the KOKKOS package
-threads only over atoms. However, for small systems, this may not expose
-enough parallelism to keep a GPU busy. When this keyword is set to {on},
-the KOKKOS package threads over both atoms and neighbors of atoms. When
-using {neigh/thread} {on}, a full neighbor list must also be used. Using
-{neigh/thread} {on} may be slower for large systems, so this this option
-is turned on by default only when there are 16K atoms or less owned by
-an MPI rank and when using a full neighbor list. Not all KOKKOS-enabled
-potentials support this keyword yet, and only thread over atoms. Many
-simple pair-wise potentials such as Lennard-Jones do support threading
-over both atoms and neighbors.
-
-The {newton} keyword sets the Newton flags for pairwise and bonded
-interactions to {off} or {on}, the same as the "newton"_newton.html
-command allows. The default for GPUs is {off} because this will almost
-always give better performance for the KOKKOS package. This means more
-computation is done, but less communication. However, when running on
-CPUs a value of {on} is the default since it can often be faster, just
-as it is for non-accelerated pair styles
-
-The {binsize} keyword sets the size of bins used to bin atoms in
-neighbor list builds. The same value can be set by the "neigh_modify
-binsize"_neigh_modify.html command. Making it an option in the package
-kokkos command allows it to be set from the command line. The default
-value for CPUs is 0.0, which means the LAMMPS default will be used,
-which is bins = 1/2 the size of the pairwise cutoff + neighbor skin
-distance. This is fine when neighbor lists are built on the CPU. For GPU
-builds, a 2x larger binsize equal to the pairwise cutoff + neighbor skin
-is often faster, which is the default. Note that if you use a
-longer-than-usual pairwise cutoff, e.g. to allow for a smaller fraction
-of KSpace work with a "long-range Coulombic solver"_kspace_style.html
-because the GPU is faster at performing pairwise interactions, then this
-rule of thumb may give too large a binsize and the default should be
-overridden with a smaller value.
-
-The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse}
-keywords determine whether the host or device performs the packing and
-unpacking of data when communicating per-atom data between processors.
-"Exchange" communication happens only on timesteps that neighbor lists
-are rebuilt. The data is only for atoms that migrate to new processors.
-"Forward" communication happens every timestep. "Reverse" communication
-happens every timestep if the {newton} option is on. The data is for
-atom coordinates and any other atom properties that needs to be updated
-for ghost atoms owned by each processor.
-
-The {comm} keyword is simply a short-cut to set the same value for both
-the {comm/exchange} and {comm/forward} and {comm/reverse} keywords.
-
-The value options for all 3 keywords are {no} or {host} or {device}. A
-value of {no} means to use the standard non-KOKKOS method of
-packing/unpacking data for the communication. A value of {host} means to
-use the host, typically a multi-core CPU, and perform the
-packing/unpacking in parallel with threads. A value of {device} means to
-use the device, typically a GPU, to perform the packing/unpacking
-operation.
-
-The optimal choice for these keywords depends on the input script and
-the hardware used. The {no} value is useful for verifying that the
-Kokkos-based {host} and {device} values are working correctly. It is the
-default when running on CPUs since it is usually the fastest.
-
-When running on CPUs or Xeon Phi, the {host} and {device} values work
-identically. When using GPUs, the {device} value is the default since it
-will typically be optimal if all of your styles used in your input
-script are supported by the KOKKOS package. In this case data can stay
-on the GPU for many timesteps without being moved between the host and
-GPU, if you use the {device} value. If your script uses styles (e.g.
-fixes) which are not yet supported by the KOKKOS package, then data has
-to be move between the host and device anyway, so it is typically faster
-to let the host handle communication, by using the {host} value. Using
-{host} instead of {no} will enable use of multiple threads to
-pack/unpack communicated data. When running small systems on a GPU,
-performing the exchange pack/unpack on the host CPU can give speedup
-since it reduces the number of CUDA kernel launches.
-
-The {cuda/aware} keyword chooses whether CUDA-aware MPI will be used. When
-this keyword is set to {on}, buffers in GPU memory are passed directly
-through MPI send/receive calls. This reduces overhead of first copying
-the data to the host CPU. However CUDA-aware MPI is not supported on all
-systems, which can lead to segmentation faults and would require using a
-value of {off}. If LAMMPS can safely detect that CUDA-aware MPI is not
-available (currently only possible with OpenMPI v2.0.0 or later), then
-the {cuda/aware} keyword is automatically set to {off} by default. When
-the {cuda/aware} keyword is set to {off} while any of the {comm}
-keywords are set to {device}, the value for these {comm} keywords will
-be automatically changed to {host}. This setting has no effect if not
-running on GPUs. CUDA-aware MPI is available for OpenMPI 1.8 (or later
-versions), Mvapich2 1.9 (or later) when the "MV2_USE_CUDA" environment
-variable is set to "1", CrayMPI, and IBM Spectrum MPI when the "-gpu"
-flag is used.
-
-:line
-
-The {omp} style invokes settings associated with the use of the
-USER-OMP package.
-
-The {Nthread} argument sets the number of OpenMP threads allocated for
-each MPI task.  For example, if your system has nodes with dual
-quad-core processors, it has a total of 8 cores per node.  You could
-use two MPI tasks per node (e.g. using the -ppn option of the mpirun
-command in MPICH or -npernode in OpenMPI), and set {Nthreads} = 4.
-This would use all 8 cores on each node.  Note that the product of MPI
-tasks * threads/task should not exceed the physical number of cores
-(on a node), otherwise performance will suffer.
-
-Setting {Nthread} = 0 instructs LAMMPS to use whatever value is the
-default for the given OpenMP environment. This is usually determined
-via the {OMP_NUM_THREADS} environment variable or the compiler
-runtime.  Note that in most cases the default for OpenMP capable
-compilers is to use one thread for each available CPU core when
-{OMP_NUM_THREADS} is not explicitly set, which can lead to poor
-performance.
-
-Here are examples of how to set the environment variable when
-launching LAMMPS:
-
-env OMP_NUM_THREADS=4 lmp_machine -sf omp -in in.script
-env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script
-mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script :pre
-
-or you can set it permanently in your shell's start-up script.
-All three of these examples use a total of 4 CPU cores.
-
-Note that different MPI implementations have different ways of passing
-the OMP_NUM_THREADS environment variable to all MPI processes.  The
-2nd example line above is for MPICH; the 3rd example line with -x is
-for OpenMPI.  Check your MPI documentation for additional details.
-
-What combination of threads and MPI tasks gives the best performance
-is difficult to predict and can depend on many components of your
-input.  Not all features of LAMMPS support OpenMP threading via the
-USER-OMP package and the parallel efficiency can be very different,
-too.
-
-Optional keyword/value pairs can also be specified.  Each has a
-default value as listed below.
-
-The {neigh} keyword specifies whether neighbor list building will be
-multi-threaded in addition to force calculations.  If {neigh} is set
-to {no} then neighbor list calculation is performed only by MPI tasks
-with no OpenMP threading.  If {mode} is {yes} (the default), a
-multi-threaded neighbor list build is used.  Using {neigh} = {yes} is
-almost always faster and should produce identical neighbor lists at the
-expense of using more memory.  Specifically, neighbor list pages are
-allocated for all threads at the same time and each thread works
-within its own pages.
-
-:line
-
-[Restrictions:]
-
-This command cannot be used after the simulation box is defined by a
-"read_data"_read_data.html or "create_box"_create_box.html command.
-
-The gpu style of this command can only be invoked if LAMMPS was built
-with the GPU package.  See the "Build package"_Build_package.html doc
-page for more info.
-
-The intel style of this command can only be invoked if LAMMPS was
-built with the USER-INTEL package.  See the "Build
-package"_Build_package.html doc page for more info.
-
-The kk style of this command can only be invoked if LAMMPS was built
-with the KOKKOS package.  See the "Build package"_Build_package.html
-doc page for more info.
-
-The omp style of this command can only be invoked if LAMMPS was built
-with the USER-OMP package.  See the "Build package"_Build_package.html
-doc page for more info.
-
-[Related commands:]
-
-"suffix"_suffix.html, "-pk command-line switch"_Run_options.html
-
-[Default:]
-
-For the GPU package, the default is Ngpu = 1 and the option defaults
-are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0
-to Ngpu-1, tpa = 1, and device = not used.  These settings are made
-automatically if the "-sf gpu" "command-line switch"_Run_options.html
-is used.  If it is not used, you must invoke the package gpu command
-in your input script or via the "-pk gpu" "command-line
-switch"_Run_options.html.
-
-For the USER-INTEL package, the default is Nphi = 1 and the option
-defaults are omp = 0, mode = mixed, lrt = no, balance = -1, tpc = 4,
-tptask = 240.  The default ghost option is determined by the pair
-style being used.  This value is output to the screen in the offload
-report at the end of each run.  Note that all of these settings,
-except "omp" and "mode", are ignored if LAMMPS was not built with Xeon
-Phi co-processor support.  These settings are made automatically if the
-"-sf intel" "command-line switch"_Run_options.html is used.  If it is
-not used, you must invoke the package intel command in your input
-script or via the "-pk intel" "command-line
-switch"_Run_options.html.
-
-For the KOKKOS package, the option defaults for GPUs are neigh = full,
-neigh/qeq = full, newton = off, binsize for GPUs = 2x LAMMPS default
-value, comm = device, cuda/aware = on. When LAMMPS can safely detect
-that CUDA-aware MPI is not available, the default value of cuda/aware
-becomes "off". For CPUs or Xeon Phis, the option defaults are neigh =
-half, neigh/qeq = half, newton = on, binsize = 0.0, and comm = no. The
-option neigh/thread = on when there are 16K atoms or less on an MPI
-rank, otherwise it is "off". These settings are made automatically by
-the required "-k on" "command-line switch"_Run_options.html. You can
-change them by using the package kokkos command in your input script or
-via the "-pk kokkos command-line switch"_Run_options.html.
-
-For the OMP package, the default is Nthreads = 0 and the option
-defaults are neigh = yes.  These settings are made automatically if
-the "-sf omp" "command-line switch"_Run_options.html is used.  If it
-is not used, you must invoke the package omp command in your input
-script or via the "-pk omp" "command-line switch"_Run_options.html.
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@ -220,7 +220,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)

  int nmpi = 0;
  MPI_Comm_size(world,&nmpi);
-  if (nmpi > 0) {
+  if (nmpi > 1) {

 #if defined(MPI_VERSION) && (MPI_VERSION > 2)
    // Check for IBM Spectrum MPI
@ -397,9 +397,14 @@ void KokkosLMP::accelerator(int narg, char **arg)
    } else error->all(FLERR,"Illegal package kokkos command");
  }

+#ifdef KOKKOS_ENABLE_CUDA
+
+  int nmpi = 0;
+  MPI_Comm_size(world,&nmpi);
+
  // if "cuda/aware off" and "comm device", change to "comm host"

-  if (!cuda_aware_flag) {
+  if (!cuda_aware_flag && nmpi > 1) {
    if (exchange_comm_classic == 0 && exchange_comm_on_host == 0) {
      exchange_comm_on_host = 1;
      exchange_comm_changed = 1;
@ -431,6 +436,8 @@ void KokkosLMP::accelerator(int narg, char **arg)
    }
  }

+#endif
+
  // set newton flags
  // set neighbor binsize, same as neigh_modify command