Merge pull request #1364 from stanmoore1/kk_binsize

Change defaults for KOKKOS package
2019-05-06 12:10:08 -04:00 · 2019-05-06 12:10:08 -04:00 · cc30688137
parent b3f4e2055e c11b1edc1f
commit cc30688137
9 changed files with 166 additions and 149 deletions
--- a/doc/src/Build_extras.txt
+++ b/doc/src/Build_extras.txt
@ -247,7 +247,10 @@ Maxwell50 = NVIDIA Maxwell generation CC 5.0
 Maxwell52 = NVIDIA Maxwell generation CC 5.2
 Maxwell53 = NVIDIA Maxwell generation CC 5.3
 Pascal60 = NVIDIA Pascal generation CC 6.0
-Pascal61 = NVIDIA Pascal generation CC 6.1 :ul
+Pascal61 = NVIDIA Pascal generation CC 6.1
 Volta70 = NVIDIA Volta generation CC 7.0
 Volta72 = NVIDIA Volta generation CC 7.2
 Turing75 = NVIDIA Turing generation CC 7.5 :ul
 [CMake build]:
--- a/doc/src/Speed_kokkos.txt
+++ b/doc/src/Speed_kokkos.txt
@ -111,16 +111,10 @@ Makefile.kokkos_mpi_only) will give better performance than the OpenMP
 back end (i.e. Makefile.kokkos_omp) because some of the overhead to make
 the code thread-safe is removed.
-NOTE: The default for the "package kokkos"_package.html command is to
+NOTE: Use the "-pk kokkos" "command-line switch"_Run_options.html to 
-use "full" neighbor lists and set the Newton flag to "off" for both
+change the default "package kokkos"_package.html options. See its doc 
-pairwise and bonded interactions. However, when running on CPUs, it
+page for details and default settings. Experimenting with its options 
-will typically be faster to use "half" neighbor lists and set the
+can provide a speed-up for specific calculations. For example: 
 Newton flag to "on", just as is the case for non-accelerated pair
 styles. It can also be faster to use non-threaded communication.  Use
 the "-pk kokkos" "command-line switch"_Run_options.html to change the
 default "package kokkos"_package.html options. See its doc page for
 details and default settings. Experimenting with its options can
 provide a speed-up for specific calculations. For example:
 mpirun -np 16 lmp_kokkos_mpi_only -k on -sf kk -pk kokkos newton on neigh half comm no -in in.lj       # Newton on, Half neighbor list, non-threaded comm :pre
@ -190,19 +184,18 @@ tasks/node. The "-k on t Nt" command-line switch sets the number of
 threads/task as Nt. The product of these two values should be N, i.e.
 256 or 264.
-NOTE: The default for the "package kokkos"_package.html command is to
+NOTE: The default for the "package kokkos"_package.html command when 
-use "full" neighbor lists and set the Newton flag to "off" for both
+running on KNL is to use "half" neighbor lists and set the Newton flag 
-pairwise and bonded interactions. When running on KNL, this will
+to "on" for both pairwise and bonded interactions. This will typically 
-typically be best for pair-wise potentials. For many-body potentials,
+be best for many-body potentials. For simpler pair-wise potentials, it 
-using "half" neighbor lists and setting the Newton flag to "on" may be
+may be faster to use a "full" neighbor list with Newton flag to "off". 
-faster. It can also be faster to use non-threaded communication.  Use
+Use the "-pk kokkos" "command-line switch"_Run_options.html to change 
-the "-pk kokkos" "command-line switch"_Run_options.html to change the
+the default "package kokkos"_package.html options. See its doc page for 
-default "package kokkos"_package.html options. See its doc page for
+details and default settings. Experimenting with its options can provide 
-details and default settings. Experimenting with its options can
+a speed-up for specific calculations. For example: 
 provide a speed-up for specific calculations. For example:
-mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos comm no -in in.lj      #  Newton off, full neighbor list, non-threaded comm
+mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos comm host -in in.reax      #  Newton on, half neighbor list, threaded comm
-mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos newton on neigh half comm no -in in.reax      # Newton on, half neighbor list, non-threaded comm :pre
+mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos newton off neigh full comm no -in in.lj      # Newton off, full neighbor list, non-threaded comm :pre
 NOTE: MPI tasks and threads should be bound to cores as described
 above for CPUs.
@ -236,19 +229,19 @@ one or more nodes, each with two GPUs:
 mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -in in.lj          # 1 node,   2 MPI tasks/node, 2 GPUs/node
 mpirun -np 32 -ppn 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -in in.lj  # 16 nodes, 2 MPI tasks/node, 2 GPUs/node (32 GPUs total) :pre
-NOTE: The default for the "package kokkos"_package.html command is to
+NOTE: The default for the "package kokkos"_package.html command when 
-use "full" neighbor lists and set the Newton flag to "off" for both
+running on GPUs is to use "full" neighbor lists and set the Newton flag 
-pairwise and bonded interactions, along with threaded communication.
+to "off" for both pairwise and bonded interactions, along with threaded 
-When running on Maxwell or Kepler GPUs, this will typically be
+communication. When running on Maxwell or Kepler GPUs, this will 
-best. For Pascal GPUs, using "half" neighbor lists and setting the
+typically be best. For Pascal GPUs, using "half" neighbor lists and 
-Newton flag to "on" may be faster. For many pair styles, setting the
+setting the Newton flag to "on" may be faster. For many pair styles, 
-neighbor binsize equal to the ghost atom cutoff will give speedup.
+setting the neighbor binsize equal to twice the CPU default value will 
-Use the "-pk kokkos" "command-line switch"_Run_options.html to change
+give speedup, which is the default when running on GPUs. Use the "-pk 
-the default "package kokkos"_package.html options. See its doc page
+kokkos" "command-line switch"_Run_options.html to change the default 
-for details and default settings. Experimenting with its options can
+"package kokkos"_package.html options. See its doc page for details and 
-provide a speed-up for specific calculations. For example:
+default settings. Experimenting with its options can provide a speed-up 
 for specific calculations. For example: 
 mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -pk kokkos binsize 2.8 -in in.lj      # Set binsize = neighbor ghost cutoff
 mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -pk kokkos newton on neigh half binsize 2.8 -in in.lj      # Newton on, half neighbor list, set binsize = neighbor ghost cutoff :pre
 NOTE: For good performance of the KOKKOS package on GPUs, you must
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@ -64,7 +64,7 @@ args = arguments specific to the style :l
      {no_affinity} values = none
  {kokkos} args = keyword value ...
    zero or more keyword/value pairs may be appended
-    keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse}
+    keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} or {gpu/direct}
      {neigh} value = {full} or {half}
        full = full neighbor list
        half = half neighbor list built in thread-safe manner
@ -72,7 +72,7 @@ args = arguments specific to the style :l
        full = full neighbor list
        half = half neighbor list built in thread-safe manner
      {newton} = {off} or {on}
-        off = set Newton pairwise and bonded flags off (default)
+        off = set Newton pairwise and bonded flags off
        on = set Newton pairwise and bonded flags on
      {binsize} value = size
        size = bin size for neighbor list construction (distance units)
@ -425,98 +425,100 @@ processes/threads used for LAMMPS.
 The {kokkos} style invokes settings associated with the use of the 
 KOKKOS package. 
-All of the settings are optional keyword/value pairs.  Each has a
+All of the settings are optional keyword/value pairs. Each has a default 
-default value as listed below.
+value as listed below. 
-The {neigh} keyword determines how neighbor lists are built.  A value
+The {neigh} keyword determines how neighbor lists are built. A value of 
-of {half} uses a thread-safe variant of half-neighbor lists,
+{half} uses a thread-safe variant of half-neighbor lists, the same as 
-the same as used by most pair styles in LAMMPS.
+used by most pair styles in LAMMPS, which is the default when running on 
 CPUs (i.e. the Kokkos CUDA back end is not enabled). 
-A value of {full} uses a full neighbor lists and is the default.  This
+A value of {full} uses a full neighbor list and is the default when 
-performs twice as much computation as the {half} option, however that
+running on GPUs. This performs twice as much computation as the {half} 
-is often a win because it is thread-safe and doesn't require atomic
+option, however that is often a win because it is thread-safe and 
-operations in the calculation of pair forces.  For that reason, {full}
+doesn't require atomic operations in the calculation of pair forces. For 
-is the default setting.  However, when running in MPI-only mode with 1
+that reason, {full} is the default setting for GPUs. However, when 
-thread per MPI task, {half} neighbor lists will typically be faster,
+running on CPUs, a {half} neighbor list is the default because it is 
-just as it is for non-accelerated pair styles. Similarly, the {neigh/qeq}
+often faster, just as it is for non-accelerated pair styles. Similarly, 
-keyword determines how neighbor lists are built for "fix qeq/reax/kk"_fix_qeq_reax.html.
+the {neigh/qeq} keyword determines how neighbor lists are built for "fix 
-If not explicitly set, the value of {neigh/qeq} will match {neigh}.
+qeq/reax/kk"_fix_qeq_reax.html. If not explicitly set, the value of 
 {neigh/qeq} will match {neigh}. 
 The {newton} keyword sets the Newton flags for pairwise and bonded 
 interactions to {off} or {on}, the same as the "newton"_newton.html 
-command allows.  The default is {off} because this will almost always
+command allows. The default for GPUs is {off} because this will almost 
-give better performance for the KOKKOS package.  This means more
+always give better performance for the KOKKOS package. This means more 
-computation is done, but less communication.  However, when running in
+computation is done, but less communication. However, when running on 
-MPI-only mode with 1 thread per MPI task, a value of {on} will
+CPUs a value of {on} is the default since it can often be faster, just 
-typically be faster, just as it is for non-accelerated pair styles.
+as it is for non-accelerated pair styles. 
 The {binsize} keyword sets the size of bins used to bin atoms in 
 neighbor list builds. The same value can be set by the "neigh_modify 
-binsize"_neigh_modify.html command.  Making it an option in the
+binsize"_neigh_modify.html command. Making it an option in the package 
-package kokkos command allows it to be set from the command line.  The
+kokkos command allows it to be set from the command line. The default 
-default value is 0.0, which means the LAMMPS default will be used,
+value for CPUs is 0.0, which means the LAMMPS default will be used, 
 which is bins = 1/2 the size of the pairwise cutoff + neighbor skin 
-distance.  This is fine when neighbor lists are built on the CPU.  For
+distance. This is fine when neighbor lists are built on the CPU. For GPU 
-GPU builds, a 2x larger binsize equal to the pairwise cutoff +
+builds, a 2x larger binsize equal to the pairwise cutoff + neighbor skin 
-neighbor skin, is often faster, which can be set by this keyword.
+is often faster, which is the default. Note that if you use a 
-Note that if you use a longer-than-usual pairwise cutoff, e.g. to
+longer-than-usual pairwise cutoff, e.g. to allow for a smaller fraction 
-allow for a smaller fraction of KSpace work with a "long-range
+of KSpace work with a "long-range Coulombic solver"_kspace_style.html 
-Coulombic solver"_kspace_style.html because the GPU is faster at
+because the GPU is faster at performing pairwise interactions, then this 
-performing pairwise interactions, then this rule of thumb may give too
+rule of thumb may give too large a binsize and the default should be 
-large a binsize.
+overridden with a smaller value. 
-The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} keywords determine
+The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} 
-whether the host or device performs the packing and unpacking of data
+keywords determine whether the host or device performs the packing and 
-when communicating per-atom data between processors.  "Exchange"
+unpacking of data when communicating per-atom data between processors. 
-communication happens only on timesteps that neighbor lists are
+"Exchange" communication happens only on timesteps that neighbor lists 
-rebuilt.  The data is only for atoms that migrate to new processors.
+are rebuilt. The data is only for atoms that migrate to new processors. 
 "Forward" communication happens every timestep. "Reverse" communication 
-happens every timestep if the {newton} option is on.  The data is for atom
+happens every timestep if the {newton} option is on. The data is for 
-coordinates and any other atom properties that needs to be updated for
+atom coordinates and any other atom properties that need to be updated 
-ghost atoms owned by each processor.
+for ghost atoms owned by each processor. 
-The {comm} keyword is simply a short-cut to set the same value
+The {comm} keyword is simply a short-cut to set the same value for both 
-for both the {comm/exchange} and {comm/forward} and {comm/reverse} keywords.
+the {comm/exchange} and {comm/forward} and {comm/reverse} keywords. 
-The value options for all 3 keywords are {no} or {host} or {device}.
+The value options for all 3 keywords are {no} or {host} or {device}. A 
-A value of {no} means to use the standard non-KOKKOS method of
+value of {no} means to use the standard non-KOKKOS method of 
-packing/unpacking data for the communication.  A value of {host} means
+packing/unpacking data for the communication. A value of {host} means to 
-to use the host, typically a multi-core CPU, and perform the
+use the host, typically a multi-core CPU, and perform the 
-packing/unpacking in parallel with threads.  A value of {device}
+packing/unpacking in parallel with threads. A value of {device} means to 
-means to use the device, typically a GPU, to perform the
+use the device, typically a GPU, to perform the packing/unpacking 
-packing/unpacking operation.
+operation. 
 The optimal choice for these keywords depends on the input script and 
 the hardware used. The {no} value is useful for verifying that the 
-Kokkos-based {host} and {device} values are working correctly.
+Kokkos-based {host} and {device} values are working correctly. It is the 
-It may also be the fastest choice when using Kokkos styles in
+default when running on CPUs since it is usually the fastest. 
 MPI-only mode (i.e. with a thread count of 1).
 When running on CPUs or Xeon Phi, the {host} and {device} values work 
-identically.  When using GPUs, the {device} value will typically be
+identically. When using GPUs, the {device} value is the default since it 
-optimal if all of your styles used in your input script are supported
+will typically be optimal if all of your styles used in your input 
-by the KOKKOS package.  In this case data can stay on the GPU for many
+script are supported by the KOKKOS package. In this case data can stay 
-timesteps without being moved between the host and GPU, if you use the
+on the GPU for many timesteps without being moved between the host and 
-{device} value.  This requires that your MPI is able to access GPU
+GPU, if you use the {device} value. This requires that your MPI is able 
-memory directly.  Currently that is true for OpenMPI 1.8 (or later
+to access GPU memory directly. Currently that is true for OpenMPI 1.8 
-versions), Mvapich2 1.9 (or later), and CrayMPI.  If your script uses
+(or later versions), Mvapich2 1.9 (or later), and CrayMPI. If your 
-styles (e.g. fixes) which are not yet supported by the KOKKOS package,
+script uses styles (e.g. fixes) which are not yet supported by the 
-then data has to be move between the host and device anyway, so it is
+KOKKOS package, then data has to be moved between the host and device 
-typically faster to let the host handle communication, by using the
+anyway, so it is typically faster to let the host handle communication, 
-{host} value.  Using {host} instead of {no} will enable use of
+by using the {host} value. Using {host} instead of {no} will enable use 
-multiple threads to pack/unpack communicated data.
+of multiple threads to pack/unpack communicated data. 
 The {gpu/direct} keyword chooses whether GPU-direct will be used. When 
 this keyword is set to {on}, buffers in GPU memory are passed directly 
 through MPI send/receive calls. This reduces overhead of first copying 
 the data to the host CPU. However GPU-direct is not supported on all 
-systems, which can lead to segmentation faults and would require
+systems, which can lead to segmentation faults and would require using a 
-using a value of {off}. If LAMMPS can safely detect that GPU-direct is
+value of {off}. If LAMMPS can safely detect that GPU-direct is not 
-not available (currently only possible with OpenMPI v2.0.0 or later),
+available (currently only possible with OpenMPI v2.0.0 or later), then 
-then the {gpu/direct} keyword is automatically set to {off} by default.
+the {gpu/direct} keyword is automatically set to {off} by default. When 
-When the {gpu/direct} keyword is set to {off} while any of the {comm}
+the {gpu/direct} keyword is set to {off} while any of the {comm} 
 keywords are set to {device}, the value for these {comm} keywords will 
-be automatically changed to {host}.
+be automatically changed to {host}. This setting has no effect if not 
 running on GPUs.
 :line
@ -623,14 +625,16 @@ not used, you must invoke the package intel command in your input
 script or via the "-pk intel" "command-line
 switch"_Run_options.html.
-For the KOKKOS package, the option defaults neigh = full, neigh/qeq =
+For the KOKKOS package, the option defaults for GPUs are neigh = full, 
-full, newton = off, binsize = 0.0, and comm = device, gpu/direct = on.
+neigh/qeq = full, newton = off, binsize for GPUs = 2x LAMMPS default 
-When LAMMPS can safely detect, that GPU-direct is not available, the
+value, comm = device, gpu/direct = on. When LAMMPS can safely detect 
-default value of gpu/direct becomes "off".
+that GPU-direct is not available, the default value of gpu/direct 
-These settings are made automatically by the required "-k on"
+becomes "off". For CPUs or Xeon Phis, the option defaults are neigh = 
-"command-line switch"_Run_options.html. You can change them by
+half, neigh/qeq = half, newton = on, binsize = 0.0, and comm = no. These 
-using the package kokkos command in your input script or via the
+settings are made automatically by the required "-k on" "command-line 
-"-pk kokkos command-line switch"_Run_options.html.
+switch"_Run_options.html. You can change them by using the package 
 kokkos command in your input script or via the "-pk kokkos command-line 
 switch"_Run_options.html.
 For the OMP package, the default is Nthreads = 0 and the option
 defaults are neigh = yes.  These settings are made automatically if
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@ -182,16 +182,28 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
  // default settings for package kokkos command
  binsize = 0.0;
  gpu_direct_flag = 1;
  if (ngpu > 0) {
    neighflag = FULL;
    neighflag_qeq = FULL;
    neighflag_qeq_set = 0;
-  exchange_comm_classic = 0;
+    newtonflag = 0;
-  forward_comm_classic = 0;
+    exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
-  reverse_comm_classic = 0;
+    exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
-  exchange_comm_on_host = 0;
+  } else {
-  forward_comm_on_host = 0;
+    if (num_threads > 1) {
-  reverse_comm_on_host = 0;
+      neighflag = HALFTHREAD;
-  gpu_direct_flag = 1;
+      neighflag_qeq = HALFTHREAD;
    } else {
      neighflag = HALF;
      neighflag_qeq = HALF;
    }
    neighflag_qeq_set = 0;
    newtonflag = 1;
    exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
    exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
  }
 #if KOKKOS_USE_CUDA
  // only if we can safely detect, that GPU-direct is not available, change default
@ -218,17 +230,6 @@ KokkosLMP::~KokkosLMP()
 void KokkosLMP::accelerator(int narg, char **arg)
 {
  // defaults
  neighflag = FULL;
  neighflag_qeq = FULL;
  neighflag_qeq_set = 0;
  int newtonflag = 0;
  double binsize = 0.0;
  exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
  exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
  gpu_direct_flag = 1;
  int iarg = 0;
  while (iarg < narg) {
    if (strcmp(arg[iarg],"neigh") == 0) {
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@ -36,6 +36,8 @@ class KokkosLMP : protected Pointers {
  int numa;
  int auto_sync;
  int gpu_direct_flag;
  int newtonflag;
  double binsize;
  KokkosLMP(class LAMMPS *, int, char **);
  ~KokkosLMP();
--- a/src/KOKKOS/neighbor_kokkos.cpp
+++ b/src/KOKKOS/neighbor_kokkos.cpp
@ -30,6 +30,7 @@
 #include "style_nstencil.h"
 #include "style_npair.h"
 #include "style_ntopo.h"
 #include "comm.h"
 using namespace LAMMPS_NS;
@ -359,6 +360,14 @@ void NeighborKokkos::modify_mol_intra_grow_kokkos(){
  k_ex_mol_intra.modify<LMPHostType>();
 }
 /* ---------------------------------------------------------------------- */
 // Choose a Kokkos-specific default neighbor bin size.
 // If no bin size has been flagged as set (binsizeflag == 0) and the Kokkos
 // instance reports at least one GPU, store cutneighmax as the user bin size
 // and raise binsizeflag; otherwise leave both members untouched.
 // NOTE(review): because binsizeflag is raised here, a later call will not
 // refresh binsize_user if cutneighmax changes -- confirm this is intended.
 void NeighborKokkos::set_binsize_kokkos() {
  // a bin size is already in effect: keep it
  if (binsizeflag) return;

  // not running on GPUs: keep the existing default
  if (!(lmp->kokkos->ngpu > 0)) return;

  binsize_user = cutneighmax;
  binsizeflag = 1;
 }
 /* ---------------------------------------------------------------------- */
 void NeighborKokkos::init_topology() {
--- a/src/KOKKOS/neighbor_kokkos.h
+++ b/src/KOKKOS/neighbor_kokkos.h
@ -87,6 +87,7 @@ class NeighborKokkos : public Neighbor {
  void modify_ex_group_grow_kokkos();
  void modify_mol_group_grow_kokkos();
  void modify_mol_intra_grow_kokkos();
  void set_binsize_kokkos();
 };
 }
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@ -471,6 +471,9 @@ void Neighbor::init()
    error->warning(FLERR,"Neighbor exclusions used with KSpace solver "
                   "may give inconsistent Coulombic energies");
  if (lmp->kokkos)
    set_binsize_kokkos();
  // ------------------------------------------------------------------
  // create pairwise lists
  // one-time call to init_styles() to scan style files and setup
--- a/src/neighbor.h
+++ b/src/neighbor.h
@ -233,6 +233,7 @@ class Neighbor : protected Pointers {
  virtual void init_ex_bit_kokkos() {}
  virtual void init_ex_mol_bit_kokkos() {}
  virtual void grow_ex_mol_intra_kokkos() {}
  virtual void set_binsize_kokkos() {}
 };
 namespace NeighConst {