Merge pull request #1364 from stanmoore1/kk_binsize

Change defaults for KOKKOS package
2019-05-06 12:10:08 -04:00 · 2019-05-06 12:10:08 -04:00 · cc30688137
parent b3f4e2055e c11b1edc1f
commit cc30688137
9 changed files with 166 additions and 149 deletions
--- a/doc/src/Build_extras.txt
+++ b/doc/src/Build_extras.txt
@ -247,7 +247,10 @@ Maxwell50 = NVIDIA Maxwell generation CC 5.0
 Maxwell52 = NVIDIA Maxwell generation CC 5.2
 Maxwell53 = NVIDIA Maxwell generation CC 5.3
 Pascal60 = NVIDIA Pascal generation CC 6.0
-Pascal61 = NVIDIA Pascal generation CC 6.1 :ul
+Pascal61 = NVIDIA Pascal generation CC 6.1
+Volta70 = NVIDIA Volta generation CC 7.0
+Volta72 = NVIDIA Volta generation CC 7.2
+Turing75 = NVIDIA Turing generation CC 7.5 :ul

 [CMake build]:

--- a/doc/src/Speed_kokkos.txt
+++ b/doc/src/Speed_kokkos.txt
@ -111,16 +111,10 @@ Makefile.kokkos_mpi_only) will give better performance than the OpenMP
 back end (i.e. Makefile.kokkos_omp) because some of the overhead to make
 the code thread-safe is removed.

-NOTE: The default for the "package kokkos"_package.html command is to
-use "full" neighbor lists and set the Newton flag to "off" for both
-pairwise and bonded interactions. However, when running on CPUs, it
-will typically be faster to use "half" neighbor lists and set the
-Newton flag to "on", just as is the case for non-accelerated pair
-styles. It can also be faster to use non-threaded communication.  Use
-the "-pk kokkos" "command-line switch"_Run_options.html to change the
-default "package kokkos"_package.html options. See its doc page for
-details and default settings. Experimenting with its options can
-provide a speed-up for specific calculations. For example:
+NOTE: Use the "-pk kokkos" "command-line switch"_Run_options.html to 
+change the default "package kokkos"_package.html options. See its doc 
+page for details and default settings. Experimenting with its options 
+can provide a speed-up for specific calculations. For example: 

 mpirun -np 16 lmp_kokkos_mpi_only -k on -sf kk -pk kokkos newton on neigh half comm no -in in.lj       # Newton on, Half neighbor list, non-threaded comm :pre

@ -190,19 +184,18 @@ tasks/node. The "-k on t Nt" command-line switch sets the number of
 threads/task as Nt. The product of these two values should be N, i.e.
 256 or 264.

-NOTE: The default for the "package kokkos"_package.html command is to
-use "full" neighbor lists and set the Newton flag to "off" for both
-pairwise and bonded interactions. When running on KNL, this will
-typically be best for pair-wise potentials. For many-body potentials,
-using "half" neighbor lists and setting the Newton flag to "on" may be
-faster. It can also be faster to use non-threaded communication.  Use
-the "-pk kokkos" "command-line switch"_Run_options.html to change the
-default "package kokkos"_package.html options. See its doc page for
-details and default settings. Experimenting with its options can
-provide a speed-up for specific calculations. For example:
+NOTE: The default for the "package kokkos"_package.html command when 
+running on KNL is to use "half" neighbor lists and set the Newton flag 
+to "on" for both pairwise and bonded interactions. This will typically 
+be best for many-body potentials. For simpler pair-wise potentials, it 
+may be faster to use a "full" neighbor list with the Newton flag "off". 
+Use the "-pk kokkos" "command-line switch"_Run_options.html to change 
+the default "package kokkos"_package.html options. See its doc page for 
+details and default settings. Experimenting with its options can provide 
+a speed-up for specific calculations. For example: 

-mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos comm no -in in.lj      #  Newton off, full neighbor list, non-threaded comm
-mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos newton on neigh half comm no -in in.reax      # Newton on, half neighbor list, non-threaded comm :pre
+mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos comm host -in in.reax      #  Newton on, half neighbor list, threaded comm
+mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos newton off neigh full comm no -in in.lj      # Newton off, full neighbor list, non-threaded comm :pre

 NOTE: MPI tasks and threads should be bound to cores as described
 above for CPUs.
@ -236,19 +229,19 @@ one or more nodes, each with two GPUs:
 mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -in in.lj          # 1 node,   2 MPI tasks/node, 2 GPUs/node
 mpirun -np 32 -ppn 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -in in.lj  # 16 nodes, 2 MPI tasks/node, 2 GPUs/node (32 GPUs total) :pre

-NOTE: The default for the "package kokkos"_package.html command is to
-use "full" neighbor lists and set the Newton flag to "off" for both
-pairwise and bonded interactions, along with threaded communication.
-When running on Maxwell or Kepler GPUs, this will typically be
-best. For Pascal GPUs, using "half" neighbor lists and setting the
-Newton flag to "on" may be faster. For many pair styles, setting the
-neighbor binsize equal to the ghost atom cutoff will give speedup.
-Use the "-pk kokkos" "command-line switch"_Run_options.html to change
-the default "package kokkos"_package.html options. See its doc page
-for details and default settings. Experimenting with its options can
-provide a speed-up for specific calculations. For example:
+NOTE: The default for the "package kokkos"_package.html command when 
+running on GPUs is to use "full" neighbor lists and set the Newton flag 
+to "off" for both pairwise and bonded interactions, along with threaded 
+communication. When running on Maxwell or Kepler GPUs, this will 
+typically be best. For Pascal GPUs, using "half" neighbor lists and 
+setting the Newton flag to "on" may be faster. For many pair styles, 
+setting the neighbor binsize equal to twice the CPU default value will 
+give a speedup; this is the default when running on GPUs. Use the "-pk 
+kokkos" "command-line switch"_Run_options.html to change the default 
+"package kokkos"_package.html options. See its doc page for details and 
+default settings. Experimenting with its options can provide a speed-up 
+for specific calculations. For example: 

-mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -pk kokkos binsize 2.8 -in in.lj      # Set binsize = neighbor ghost cutoff
 mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -pk kokkos newton on neigh half binsize 2.8 -in in.lj      # Newton on, half neighbor list, set binsize = neighbor ghost cutoff :pre

 NOTE: For good performance of the KOKKOS package on GPUs, you must
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@ -64,7 +64,7 @@ args = arguments specific to the style :l
      {no_affinity} values = none
  {kokkos} args = keyword value ...
    zero or more keyword/value pairs may be appended
-    keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse}
+    keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} or {gpu/direct}
      {neigh} value = {full} or {half}
        full = full neighbor list
        half = half neighbor list built in thread-safe manner
@ -72,7 +72,7 @@ args = arguments specific to the style :l
        full = full neighbor list
        half = half neighbor list built in thread-safe manner
      {newton} = {off} or {on}
-        off = set Newton pairwise and bonded flags off (default)
+        off = set Newton pairwise and bonded flags off
        on = set Newton pairwise and bonded flags on
      {binsize} value = size
        size = bin size for neighbor list construction (distance units)
@ -422,101 +422,103 @@ processes/threads used for LAMMPS.

 :line

-The {kokkos} style invokes settings associated with the use of the
-KOKKOS package.
+The {kokkos} style invokes settings associated with the use of the 
+KOKKOS package. 

-All of the settings are optional keyword/value pairs.  Each has a
-default value as listed below.
+All of the settings are optional keyword/value pairs. Each has a default 
+value as listed below. 

-The {neigh} keyword determines how neighbor lists are built.  A value
-of {half} uses a thread-safe variant of half-neighbor lists,
-the same as used by most pair styles in LAMMPS.
+The {neigh} keyword determines how neighbor lists are built. A value of 
+{half} uses a thread-safe variant of half-neighbor lists, the same as 
+used by most pair styles in LAMMPS, which is the default when running on 
+CPUs (i.e. the Kokkos CUDA back end is not enabled). 

-A value of {full} uses a full neighbor lists and is the default.  This
-performs twice as much computation as the {half} option, however that
-is often a win because it is thread-safe and doesn't require atomic
-operations in the calculation of pair forces.  For that reason, {full}
-is the default setting.  However, when running in MPI-only mode with 1
-thread per MPI task, {half} neighbor lists will typically be faster,
-just as it is for non-accelerated pair styles. Similarly, the {neigh/qeq}
-keyword determines how neighbor lists are built for "fix qeq/reax/kk"_fix_qeq_reax.html.
-If not explicitly set, the value of {neigh/qeq} will match {neigh}.
+A value of {full} uses a full neighbor list and is the default when 
+running on GPUs. This performs twice as much computation as the {half} 
+option, however that is often a win because it is thread-safe and 
+doesn't require atomic operations in the calculation of pair forces. For 
+that reason, {full} is the default setting for GPUs. However, when 
+running on CPUs, a {half} neighbor list is the default because it is 
+often faster, just as it is for non-accelerated pair styles. Similarly, 
+the {neigh/qeq} keyword determines how neighbor lists are built for "fix 
+qeq/reax/kk"_fix_qeq_reax.html. If not explicitly set, the value of 
+{neigh/qeq} will match {neigh}. 

-The {newton} keyword sets the Newton flags for pairwise and bonded
-interactions to {off} or {on}, the same as the "newton"_newton.html
-command allows.  The default is {off} because this will almost always
-give better performance for the KOKKOS package.  This means more
-computation is done, but less communication.  However, when running in
-MPI-only mode with 1 thread per MPI task, a value of {on} will
-typically be faster, just as it is for non-accelerated pair styles.
+The {newton} keyword sets the Newton flags for pairwise and bonded 
+interactions to {off} or {on}, the same as the "newton"_newton.html 
+command allows. The default for GPUs is {off} because this will almost 
+always give better performance for the KOKKOS package. This means more 
+computation is done, but less communication. However, when running on 
+CPUs a value of {on} is the default since it can often be faster, just 
+as it is for non-accelerated pair styles. 

-The {binsize} keyword sets the size of bins used to bin atoms in
-neighbor list builds.  The same value can be set by the "neigh_modify
-binsize"_neigh_modify.html command.  Making it an option in the
-package kokkos command allows it to be set from the command line.  The
-default value is 0.0, which means the LAMMPS default will be used,
-which is bins = 1/2 the size of the pairwise cutoff + neighbor skin
-distance.  This is fine when neighbor lists are built on the CPU.  For
-GPU builds, a 2x larger binsize equal to the pairwise cutoff +
-neighbor skin, is often faster, which can be set by this keyword.
-Note that if you use a longer-than-usual pairwise cutoff, e.g. to
-allow for a smaller fraction of KSpace work with a "long-range
-Coulombic solver"_kspace_style.html because the GPU is faster at
-performing pairwise interactions, then this rule of thumb may give too
-large a binsize.
+The {binsize} keyword sets the size of bins used to bin atoms in 
+neighbor list builds. The same value can be set by the "neigh_modify 
+binsize"_neigh_modify.html command. Making it an option in the package 
+kokkos command allows it to be set from the command line. The default 
+value for CPUs is 0.0, which means the LAMMPS default will be used, 
+which is bins = 1/2 the size of the pairwise cutoff + neighbor skin 
+distance. This is fine when neighbor lists are built on the CPU. For GPU 
+builds, a 2x larger binsize equal to the pairwise cutoff + neighbor skin 
+is often faster and is the default on GPUs. Note that if you use a 
+longer-than-usual pairwise cutoff, e.g. to allow for a smaller fraction 
+of KSpace work with a "long-range Coulombic solver"_kspace_style.html 
+because the GPU is faster at performing pairwise interactions, then this 
+rule of thumb may give too large a binsize and the default should be 
+overridden with a smaller value. 

-The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} keywords determine
-whether the host or device performs the packing and unpacking of data
-when communicating per-atom data between processors.  "Exchange"
-communication happens only on timesteps that neighbor lists are
-rebuilt.  The data is only for atoms that migrate to new processors.
-"Forward" communication happens every timestep. "Reverse" communication
-happens every timestep if the {newton} option is on.  The data is for atom
-coordinates and any other atom properties that needs to be updated for
-ghost atoms owned by each processor.
+The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} 
+keywords determine whether the host or device performs the packing and 
+unpacking of data when communicating per-atom data between processors. 
+"Exchange" communication happens only on timesteps that neighbor lists 
+are rebuilt. The data is only for atoms that migrate to new processors. 
+"Forward" communication happens every timestep. "Reverse" communication 
+happens every timestep if the {newton} option is on. The data is for 
+atom coordinates and any other atom properties that need to be updated 
+for ghost atoms owned by each processor. 

-The {comm} keyword is simply a short-cut to set the same value
-for both the {comm/exchange} and {comm/forward} and {comm/reverse} keywords.
+The {comm} keyword is simply a short-cut to set the same value for all 
+three of the {comm/exchange}, {comm/forward}, and {comm/reverse} keywords. 

-The value options for all 3 keywords are {no} or {host} or {device}.
-A value of {no} means to use the standard non-KOKKOS method of
-packing/unpacking data for the communication.  A value of {host} means
-to use the host, typically a multi-core CPU, and perform the
-packing/unpacking in parallel with threads.  A value of {device}
-means to use the device, typically a GPU, to perform the
-packing/unpacking operation.
+The value options for all 3 keywords are {no} or {host} or {device}. A 
+value of {no} means to use the standard non-KOKKOS method of 
+packing/unpacking data for the communication. A value of {host} means to 
+use the host, typically a multi-core CPU, and perform the 
+packing/unpacking in parallel with threads. A value of {device} means to 
+use the device, typically a GPU, to perform the packing/unpacking 
+operation. 

-The optimal choice for these keywords depends on the input script and
-the hardware used.  The {no} value is useful for verifying that the
-Kokkos-based {host} and {device} values are working correctly.
-It may also be the fastest choice when using Kokkos styles in
-MPI-only mode (i.e. with a thread count of 1).
+The optimal choice for these keywords depends on the input script and 
+the hardware used. The {no} value is useful for verifying that the 
+Kokkos-based {host} and {device} values are working correctly. It is the 
+default when running on CPUs since it is usually the fastest. 

-When running on CPUs or Xeon Phi, the {host} and {device} values work
-identically.  When using GPUs, the {device} value will typically be
-optimal if all of your styles used in your input script are supported
-by the KOKKOS package.  In this case data can stay on the GPU for many
-timesteps without being moved between the host and GPU, if you use the
-{device} value.  This requires that your MPI is able to access GPU
-memory directly.  Currently that is true for OpenMPI 1.8 (or later
-versions), Mvapich2 1.9 (or later), and CrayMPI.  If your script uses
-styles (e.g. fixes) which are not yet supported by the KOKKOS package,
-then data has to be move between the host and device anyway, so it is
-typically faster to let the host handle communication, by using the
-{host} value.  Using {host} instead of {no} will enable use of
-multiple threads to pack/unpack communicated data.
+When running on CPUs or Xeon Phi, the {host} and {device} values work 
+identically. When using GPUs, the {device} value is the default since it 
+will typically be optimal if all of your styles used in your input 
+script are supported by the KOKKOS package. In this case data can stay 
+on the GPU for many timesteps without being moved between the host and 
+GPU, if you use the {device} value. This requires that your MPI is able 
+to access GPU memory directly. Currently that is true for OpenMPI 1.8 
+(or later versions), Mvapich2 1.9 (or later), and CrayMPI. If your 
+script uses styles (e.g. fixes) which are not yet supported by the 
+KOKKOS package, then data has to be moved between the host and device 
+anyway, so it is typically faster to let the host handle communication, 
+by using the {host} value. Using {host} instead of {no} will enable use 
+of multiple threads to pack/unpack communicated data. 

-The {gpu/direct} keyword chooses whether GPU-direct will be used. When
-this keyword is set to {on}, buffers in GPU memory are passed directly
-through MPI send/receive calls. This reduces overhead of first copying
-the data to the host CPU. However GPU-direct is not supported on all
-systems, which can lead to segmentation faults and would require
-using a value of {off}. If LAMMPS can safely detect that GPU-direct is
-not available (currently only possible with OpenMPI v2.0.0 or later),
-then the {gpu/direct} keyword is automatically set to {off} by default.
-When the {gpu/direct} keyword is set to {off} while any of the {comm}
-keywords are set to {device}, the value for these {comm} keywords will
-be automatically changed to {host}.
+The {gpu/direct} keyword chooses whether GPU-direct will be used. When 
+this keyword is set to {on}, buffers in GPU memory are passed directly 
+through MPI send/receive calls. This reduces overhead of first copying 
+the data to the host CPU. However GPU-direct is not supported on all 
+systems, which can lead to segmentation faults and would require using a 
+value of {off}. If LAMMPS can safely detect that GPU-direct is not 
+available (currently only possible with OpenMPI v2.0.0 or later), then 
+the {gpu/direct} keyword is automatically set to {off} by default. When 
+the {gpu/direct} keyword is set to {off} while any of the {comm} 
+keywords are set to {device}, the value for these {comm} keywords will 
+be automatically changed to {host}. This setting has no effect if not 
+running on GPUs.

 :line

@ -623,14 +625,16 @@ not used, you must invoke the package intel command in your input
 script or or via the "-pk intel" "command-line
 switch"_Run_options.html.

-For the KOKKOS package, the option defaults neigh = full, neigh/qeq =
-full, newton = off, binsize = 0.0, and comm = device, gpu/direct = on.
-When LAMMPS can safely detect, that GPU-direct is not available, the
-default value of gpu/direct becomes "off".
-These settings are made automatically by the required "-k on"
-"command-line switch"_Run_options.html. You can change them by
-using the package kokkos command in your input script or via the
-"-pk kokkos command-line switch"_Run_options.html.
+For the KOKKOS package, the option defaults for GPUs are neigh = full, 
+neigh/qeq = full, newton = off, binsize = 2x LAMMPS default 
+value, comm = device, gpu/direct = on. When LAMMPS can safely detect 
+that GPU-direct is not available, the default value of gpu/direct 
+becomes "off". For CPUs or Xeon Phis, the option defaults are neigh = 
+half, neigh/qeq = half, newton = on, binsize = 0.0, and comm = no. These 
+settings are made automatically by the required "-k on" "command-line 
+switch"_Run_options.html. You can change them by using the package 
+kokkos command in your input script or via the "-pk kokkos command-line 
+switch"_Run_options.html.

 For the OMP package, the default is Nthreads = 0 and the option
 defaults are neigh = yes.  These settings are made automatically if
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@ -182,16 +182,28 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)

  // default settings for package kokkos command

-  neighflag = FULL;
-  neighflag_qeq = FULL;
-  neighflag_qeq_set = 0;
-  exchange_comm_classic = 0;
-  forward_comm_classic = 0;
-  reverse_comm_classic = 0;
-  exchange_comm_on_host = 0;
-  forward_comm_on_host = 0;
-  reverse_comm_on_host = 0;
+  binsize = 0.0;
  gpu_direct_flag = 1;
+  if (ngpu > 0) {
+    neighflag = FULL;
+    neighflag_qeq = FULL;
+    neighflag_qeq_set = 0;
+    newtonflag = 0;
+    exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
+    exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
+  } else {
+    if (num_threads > 1) {
+      neighflag = HALFTHREAD;
+      neighflag_qeq = HALFTHREAD;
+    } else {
+      neighflag = HALF;
+      neighflag_qeq = HALF;
+    }
+    neighflag_qeq_set = 0;
+    newtonflag = 1;
+    exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
+    exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
+  }

 #if KOKKOS_USE_CUDA
  // only if we can safely detect, that GPU-direct is not available, change default
@ -218,17 +230,6 @@ KokkosLMP::~KokkosLMP()

 void KokkosLMP::accelerator(int narg, char **arg)
 {
-  // defaults
-
-  neighflag = FULL;
-  neighflag_qeq = FULL;
-  neighflag_qeq_set = 0;
-  int newtonflag = 0;
-  double binsize = 0.0;
-  exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
-  exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
-  gpu_direct_flag = 1;
-
  int iarg = 0;
  while (iarg < narg) {
    if (strcmp(arg[iarg],"neigh") == 0) {
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@ -36,6 +36,8 @@ class KokkosLMP : protected Pointers {
  int numa;
  int auto_sync;
  int gpu_direct_flag;
+  int newtonflag;
+  double binsize;

  KokkosLMP(class LAMMPS *, int, char **);
  ~KokkosLMP();
--- a/src/KOKKOS/neighbor_kokkos.cpp
+++ b/src/KOKKOS/neighbor_kokkos.cpp
@ -30,6 +30,7 @@
 #include "style_nstencil.h"
 #include "style_npair.h"
 #include "style_ntopo.h"
+#include "comm.h"

 using namespace LAMMPS_NS;

@ -359,6 +360,14 @@ void NeighborKokkos::modify_mol_intra_grow_kokkos(){
  k_ex_mol_intra.modify<LMPHostType>();
 }

+/* ---------------------------------------------------------------------- */
+void NeighborKokkos::set_binsize_kokkos() {  // choose a default neighbor bin size for GPU runs
+  if (!binsizeflag && lmp->kokkos->ngpu > 0) {  // act only when binsizeflag is unset and at least one GPU is in use
+    binsize_user = cutneighmax;  // NOTE(review): presumably max pairwise cutoff + skin, i.e. 2x the usual default bin size - confirm
+    binsizeflag = 1;  // flag binsize_user as set
+  }
+}
+
 /* ---------------------------------------------------------------------- */

 void NeighborKokkos::init_topology() {
--- a/src/KOKKOS/neighbor_kokkos.h
+++ b/src/KOKKOS/neighbor_kokkos.h
@ -87,6 +87,7 @@ class NeighborKokkos : public Neighbor {
  void modify_ex_group_grow_kokkos();
  void modify_mol_group_grow_kokkos();
  void modify_mol_intra_grow_kokkos();
+  void set_binsize_kokkos();
 };

 }
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@ -471,6 +471,9 @@ void Neighbor::init()
    error->warning(FLERR,"Neighbor exclusions used with KSpace solver "
                   "may give inconsistent Coulombic energies");

+  if (lmp->kokkos)
+    set_binsize_kokkos();
+
  // ------------------------------------------------------------------
  // create pairwise lists
  // one-time call to init_styles() to scan style files and setup
--- a/src/neighbor.h
+++ b/src/neighbor.h
@ -233,6 +233,7 @@ class Neighbor : protected Pointers {
  virtual void init_ex_bit_kokkos() {}
  virtual void init_ex_mol_bit_kokkos() {}
  virtual void grow_ex_mol_intra_kokkos() {}
+  virtual void set_binsize_kokkos() {}
 };

 namespace NeighConst {