Merge pull request #1364 from stanmoore1/kk_binsize

Change defaults for KOKKOS package
This commit is contained in:
Axel Kohlmeyer 2019-05-06 12:10:08 -04:00 committed by GitHub
commit cc30688137
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 166 additions and 149 deletions

View File

@ -247,7 +247,10 @@ Maxwell50 = NVIDIA Maxwell generation CC 5.0
Maxwell52 = NVIDIA Maxwell generation CC 5.2
Maxwell53 = NVIDIA Maxwell generation CC 5.3
Pascal60 = NVIDIA Pascal generation CC 6.0
Pascal61 = NVIDIA Pascal generation CC 6.1 :ul
Pascal61 = NVIDIA Pascal generation CC 6.1
Volta70 = NVIDIA Volta generation CC 7.0
Volta72 = NVIDIA Volta generation CC 7.2
Turing75 = NVIDIA Turing generation CC 7.5 :ul
[CMake build]:

View File

@ -111,16 +111,10 @@ Makefile.kokkos_mpi_only) will give better performance than the OpenMP
back end (i.e. Makefile.kokkos_omp) because some of the overhead to make
the code thread-safe is removed.
NOTE: The default for the "package kokkos"_package.html command is to
use "full" neighbor lists and set the Newton flag to "off" for both
pairwise and bonded interactions. However, when running on CPUs, it
will typically be faster to use "half" neighbor lists and set the
Newton flag to "on", just as is the case for non-accelerated pair
styles. It can also be faster to use non-threaded communication. Use
the "-pk kokkos" "command-line switch"_Run_options.html to change the
default "package kokkos"_package.html options. See its doc page for
details and default settings. Experimenting with its options can
provide a speed-up for specific calculations. For example:
NOTE: Use the "-pk kokkos" "command-line switch"_Run_options.html to
change the default "package kokkos"_package.html options. See its doc
page for details and default settings. Experimenting with its options
can provide a speed-up for specific calculations. For example:
mpirun -np 16 lmp_kokkos_mpi_only -k on -sf kk -pk kokkos newton on neigh half comm no -in in.lj # Newton on, Half neighbor list, non-threaded comm :pre
@ -190,19 +184,18 @@ tasks/node. The "-k on t Nt" command-line switch sets the number of
threads/task as Nt. The product of these two values should be N, i.e.
256 or 264.
NOTE: The default for the "package kokkos"_package.html command is to
use "full" neighbor lists and set the Newton flag to "off" for both
pairwise and bonded interactions. When running on KNL, this will
typically be best for pair-wise potentials. For many-body potentials,
using "half" neighbor lists and setting the Newton flag to "on" may be
faster. It can also be faster to use non-threaded communication. Use
the "-pk kokkos" "command-line switch"_Run_options.html to change the
default "package kokkos"_package.html options. See its doc page for
details and default settings. Experimenting with its options can
provide a speed-up for specific calculations. For example:
NOTE: The default for the "package kokkos"_package.html command when
running on KNL is to use "half" neighbor lists and set the Newton flag
to "on" for both pairwise and bonded interactions. This will typically
be best for many-body potentials. For simpler pair-wise potentials, it
may be faster to use a "full" neighbor list with Newton flag to "off".
Use the "-pk kokkos" "command-line switch"_Run_options.html to change
the default "package kokkos"_package.html options. See its doc page for
details and default settings. Experimenting with its options can provide
a speed-up for specific calculations. For example:
mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos comm no -in in.lj # Newton off, full neighbor list, non-threaded comm
mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos newton on neigh half comm no -in in.reax # Newton on, half neighbor list, non-threaded comm :pre
mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos comm host -in in.reax # Newton on, half neighbor list, threaded comm
mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos newton off neigh full comm no -in in.lj # Newton off, full neighbor list, non-threaded comm :pre
NOTE: MPI tasks and threads should be bound to cores as described
above for CPUs.
@ -236,19 +229,19 @@ one or more nodes, each with two GPUs:
mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -in in.lj # 1 node, 2 MPI tasks/node, 2 GPUs/node
mpirun -np 32 -ppn 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -in in.lj # 16 nodes, 2 MPI tasks/node, 2 GPUs/node (32 GPUs total) :pre
NOTE: The default for the "package kokkos"_package.html command is to
use "full" neighbor lists and set the Newton flag to "off" for both
pairwise and bonded interactions, along with threaded communication.
When running on Maxwell or Kepler GPUs, this will typically be
best. For Pascal GPUs, using "half" neighbor lists and setting the
Newton flag to "on" may be faster. For many pair styles, setting the
neighbor binsize equal to the ghost atom cutoff will give speedup.
Use the "-pk kokkos" "command-line switch"_Run_options.html to change
the default "package kokkos"_package.html options. See its doc page
for details and default settings. Experimenting with its options can
provide a speed-up for specific calculations. For example:
NOTE: The default for the "package kokkos"_package.html command when
running on GPUs is to use "full" neighbor lists and set the Newton flag
to "off" for both pairwise and bonded interactions, along with threaded
communication. When running on Maxwell or Kepler GPUs, this will
typically be best. For Pascal GPUs, using "half" neighbor lists and
setting the Newton flag to "on" may be faster. For many pair styles,
setting the neighbor binsize equal to twice the CPU default value will
give speedup, which is the default when running on GPUs. Use the "-pk
kokkos" "command-line switch"_Run_options.html to change the default
"package kokkos"_package.html options. See its doc page for details and
default settings. Experimenting with its options can provide a speed-up
for specific calculations. For example:
mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -pk kokkos binsize 2.8 -in in.lj # Set binsize = neighbor ghost cutoff
mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -pk kokkos newton on neigh half binsize 2.8 -in in.lj # Newton on, half neighbor list, set binsize = neighbor ghost cutoff :pre
NOTE: For good performance of the KOKKOS package on GPUs, you must

View File

@ -64,7 +64,7 @@ args = arguments specific to the style :l
{no_affinity} values = none
{kokkos} args = keyword value ...
zero or more keyword/value pairs may be appended
keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse}
keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} or {gpu/direct}
{neigh} value = {full} or {half}
full = full neighbor list
half = half neighbor list built in thread-safe manner
@ -72,7 +72,7 @@ args = arguments specific to the style :l
full = full neighbor list
half = half neighbor list built in thread-safe manner
{newton} = {off} or {on}
off = set Newton pairwise and bonded flags off (default)
off = set Newton pairwise and bonded flags off
on = set Newton pairwise and bonded flags on
{binsize} value = size
size = bin size for neighbor list construction (distance units)
@ -425,98 +425,100 @@ processes/threads used for LAMMPS.
The {kokkos} style invokes settings associated with the use of the
KOKKOS package.
All of the settings are optional keyword/value pairs. Each has a
default value as listed below.
All of the settings are optional keyword/value pairs. Each has a default
value as listed below.
The {neigh} keyword determines how neighbor lists are built. A value
of {half} uses a thread-safe variant of half-neighbor lists,
the same as used by most pair styles in LAMMPS.
The {neigh} keyword determines how neighbor lists are built. A value of
{half} uses a thread-safe variant of half-neighbor lists, the same as
used by most pair styles in LAMMPS, which is the default when running on
CPUs (i.e. the Kokkos CUDA back end is not enabled).
A value of {full} uses a full neighbor lists and is the default. This
performs twice as much computation as the {half} option, however that
is often a win because it is thread-safe and doesn't require atomic
operations in the calculation of pair forces. For that reason, {full}
is the default setting. However, when running in MPI-only mode with 1
thread per MPI task, {half} neighbor lists will typically be faster,
just as it is for non-accelerated pair styles. Similarly, the {neigh/qeq}
keyword determines how neighbor lists are built for "fix qeq/reax/kk"_fix_qeq_reax.html.
If not explicitly set, the value of {neigh/qeq} will match {neigh}.
A value of {full} uses a full neighbor lists and is the default when
running on GPUs. This performs twice as much computation as the {half}
option, however that is often a win because it is thread-safe and
doesn't require atomic operations in the calculation of pair forces. For
that reason, {full} is the default setting for GPUs. However, when
running on CPUs, a {half} neighbor list is the default because it are
often faster, just as it is for non-accelerated pair styles. Similarly,
the {neigh/qeq} keyword determines how neighbor lists are built for "fix
qeq/reax/kk"_fix_qeq_reax.html. If not explicitly set, the value of
{neigh/qeq} will match {neigh}.
The {newton} keyword sets the Newton flags for pairwise and bonded
interactions to {off} or {on}, the same as the "newton"_newton.html
command allows. The default is {off} because this will almost always
give better performance for the KOKKOS package. This means more
computation is done, but less communication. However, when running in
MPI-only mode with 1 thread per MPI task, a value of {on} will
typically be faster, just as it is for non-accelerated pair styles.
command allows. The default for GPUs is {off} because this will almost
always give better performance for the KOKKOS package. This means more
computation is done, but less communication. However, when running on
CPUs a value of {on} is the default since it can often be faster, just
as it is for non-accelerated pair styles
The {binsize} keyword sets the size of bins used to bin atoms in
neighbor list builds. The same value can be set by the "neigh_modify
binsize"_neigh_modify.html command. Making it an option in the
package kokkos command allows it to be set from the command line. The
default value is 0.0, which means the LAMMPS default will be used,
binsize"_neigh_modify.html command. Making it an option in the package
kokkos command allows it to be set from the command line. The default
value for CPUs is 0.0, which means the LAMMPS default will be used,
which is bins = 1/2 the size of the pairwise cutoff + neighbor skin
distance. This is fine when neighbor lists are built on the CPU. For
GPU builds, a 2x larger binsize equal to the pairwise cutoff +
neighbor skin, is often faster, which can be set by this keyword.
Note that if you use a longer-than-usual pairwise cutoff, e.g. to
allow for a smaller fraction of KSpace work with a "long-range
Coulombic solver"_kspace_style.html because the GPU is faster at
performing pairwise interactions, then this rule of thumb may give too
large a binsize.
distance. This is fine when neighbor lists are built on the CPU. For GPU
builds, a 2x larger binsize equal to the pairwise cutoff + neighbor skin
is often faster, which is the default. Note that if you use a
longer-than-usual pairwise cutoff, e.g. to allow for a smaller fraction
of KSpace work with a "long-range Coulombic solver"_kspace_style.html
because the GPU is faster at performing pairwise interactions, then this
rule of thumb may give too large a binsize and the default should be
overridden with a smaller value.
The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} keywords determine
whether the host or device performs the packing and unpacking of data
when communicating per-atom data between processors. "Exchange"
communication happens only on timesteps that neighbor lists are
rebuilt. The data is only for atoms that migrate to new processors.
The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse}
keywords determine whether the host or device performs the packing and
unpacking of data when communicating per-atom data between processors.
"Exchange" communication happens only on timesteps that neighbor lists
are rebuilt. The data is only for atoms that migrate to new processors.
"Forward" communication happens every timestep. "Reverse" communication
happens every timestep if the {newton} option is on. The data is for atom
coordinates and any other atom properties that needs to be updated for
ghost atoms owned by each processor.
happens every timestep if the {newton} option is on. The data is for
atom coordinates and any other atom properties that needs to be updated
for ghost atoms owned by each processor.
The {comm} keyword is simply a short-cut to set the same value
for both the {comm/exchange} and {comm/forward} and {comm/reverse} keywords.
The {comm} keyword is simply a short-cut to set the same value for both
the {comm/exchange} and {comm/forward} and {comm/reverse} keywords.
The value options for all 3 keywords are {no} or {host} or {device}.
A value of {no} means to use the standard non-KOKKOS method of
packing/unpacking data for the communication. A value of {host} means
to use the host, typically a multi-core CPU, and perform the
packing/unpacking in parallel with threads. A value of {device}
means to use the device, typically a GPU, to perform the
packing/unpacking operation.
The value options for all 3 keywords are {no} or {host} or {device}. A
value of {no} means to use the standard non-KOKKOS method of
packing/unpacking data for the communication. A value of {host} means to
use the host, typically a multi-core CPU, and perform the
packing/unpacking in parallel with threads. A value of {device} means to
use the device, typically a GPU, to perform the packing/unpacking
operation.
The optimal choice for these keywords depends on the input script and
the hardware used. The {no} value is useful for verifying that the
Kokkos-based {host} and {device} values are working correctly.
It may also be the fastest choice when using Kokkos styles in
MPI-only mode (i.e. with a thread count of 1).
Kokkos-based {host} and {device} values are working correctly. It is the
default when running on CPUs since it is usually the fastest.
When running on CPUs or Xeon Phi, the {host} and {device} values work
identically. When using GPUs, the {device} value will typically be
optimal if all of your styles used in your input script are supported
by the KOKKOS package. In this case data can stay on the GPU for many
timesteps without being moved between the host and GPU, if you use the
{device} value. This requires that your MPI is able to access GPU
memory directly. Currently that is true for OpenMPI 1.8 (or later
versions), Mvapich2 1.9 (or later), and CrayMPI. If your script uses
styles (e.g. fixes) which are not yet supported by the KOKKOS package,
then data has to be move between the host and device anyway, so it is
typically faster to let the host handle communication, by using the
{host} value. Using {host} instead of {no} will enable use of
multiple threads to pack/unpack communicated data.
identically. When using GPUs, the {device} value is the default since it
will typically be optimal if all of your styles used in your input
script are supported by the KOKKOS package. In this case data can stay
on the GPU for many timesteps without being moved between the host and
GPU, if you use the {device} value. This requires that your MPI is able
to access GPU memory directly. Currently that is true for OpenMPI 1.8
(or later versions), Mvapich2 1.9 (or later), and CrayMPI. If your
script uses styles (e.g. fixes) which are not yet supported by the
KOKKOS package, then data has to be move between the host and device
anyway, so it is typically faster to let the host handle communication,
by using the {host} value. Using {host} instead of {no} will enable use
of multiple threads to pack/unpack communicated data.
The {gpu/direct} keyword chooses whether GPU-direct will be used. When
this keyword is set to {on}, buffers in GPU memory are passed directly
through MPI send/receive calls. This reduces overhead of first copying
the data to the host CPU. However GPU-direct is not supported on all
systems, which can lead to segmentation faults and would require
using a value of {off}. If LAMMPS can safely detect that GPU-direct is
not available (currently only possible with OpenMPI v2.0.0 or later),
then the {gpu/direct} keyword is automatically set to {off} by default.
When the {gpu/direct} keyword is set to {off} while any of the {comm}
systems, which can lead to segmentation faults and would require using a
value of {off}. If LAMMPS can safely detect that GPU-direct is not
available (currently only possible with OpenMPI v2.0.0 or later), then
the {gpu/direct} keyword is automatically set to {off} by default. When
the {gpu/direct} keyword is set to {off} while any of the {comm}
keywords are set to {device}, the value for these {comm} keywords will
be automatically changed to {host}.
be automatically changed to {host}. This setting has no effect if not
running on GPUs.
:line
@ -623,14 +625,16 @@ not used, you must invoke the package intel command in your input
script or or via the "-pk intel" "command-line
switch"_Run_options.html.
For the KOKKOS package, the option defaults neigh = full, neigh/qeq =
full, newton = off, binsize = 0.0, and comm = device, gpu/direct = on.
When LAMMPS can safely detect, that GPU-direct is not available, the
default value of gpu/direct becomes "off".
These settings are made automatically by the required "-k on"
"command-line switch"_Run_options.html. You can change them by
using the package kokkos command in your input script or via the
"-pk kokkos command-line switch"_Run_options.html.
For the KOKKOS package, the option defaults for GPUs are neigh = full,
neigh/qeq = full, newton = off, binsize for GPUs = 2x LAMMPS default
value, comm = device, gpu/direct = on. When LAMMPS can safely detect
that GPU-direct is not available, the default value of gpu/direct
becomes "off". For CPUs or Xeon Phis, the option defaults are neigh =
half, neigh/qeq = half, newton = on, binsize = 0.0, and comm = no. These
settings are made automatically by the required "-k on" "command-line
switch"_Run_options.html. You can change them by using the package
kokkos command in your input script or via the "-pk kokkos command-line
switch"_Run_options.html.
For the OMP package, the default is Nthreads = 0 and the option
defaults are neigh = yes. These settings are made automatically if

View File

@ -182,16 +182,28 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
// default settings for package kokkos command
binsize = 0.0;
gpu_direct_flag = 1;
if (ngpu > 0) {
neighflag = FULL;
neighflag_qeq = FULL;
neighflag_qeq_set = 0;
exchange_comm_classic = 0;
forward_comm_classic = 0;
reverse_comm_classic = 0;
exchange_comm_on_host = 0;
forward_comm_on_host = 0;
reverse_comm_on_host = 0;
gpu_direct_flag = 1;
newtonflag = 0;
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
} else {
if (num_threads > 1) {
neighflag = HALFTHREAD;
neighflag_qeq = HALFTHREAD;
} else {
neighflag = HALF;
neighflag_qeq = HALF;
}
neighflag_qeq_set = 0;
newtonflag = 1;
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
}
#if KOKKOS_USE_CUDA
// only if we can safely detect, that GPU-direct is not available, change default
@ -218,17 +230,6 @@ KokkosLMP::~KokkosLMP()
void KokkosLMP::accelerator(int narg, char **arg)
{
// defaults
neighflag = FULL;
neighflag_qeq = FULL;
neighflag_qeq_set = 0;
int newtonflag = 0;
double binsize = 0.0;
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
gpu_direct_flag = 1;
int iarg = 0;
while (iarg < narg) {
if (strcmp(arg[iarg],"neigh") == 0) {

View File

@ -36,6 +36,8 @@ class KokkosLMP : protected Pointers {
int numa;
int auto_sync;
int gpu_direct_flag;
int newtonflag;
double binsize;
KokkosLMP(class LAMMPS *, int, char **);
~KokkosLMP();

View File

@ -30,6 +30,7 @@
#include "style_nstencil.h"
#include "style_npair.h"
#include "style_ntopo.h"
#include "comm.h"
using namespace LAMMPS_NS;
@ -359,6 +360,14 @@ void NeighborKokkos::modify_mol_intra_grow_kokkos(){
k_ex_mol_intra.modify<LMPHostType>();
}
/* ---------------------------------------------------------------------- */
void NeighborKokkos::set_binsize_kokkos() {
if (!binsizeflag && lmp->kokkos->ngpu > 0) {
binsize_user = cutneighmax;
binsizeflag = 1;
}
}
/* ---------------------------------------------------------------------- */
void NeighborKokkos::init_topology() {

View File

@ -87,6 +87,7 @@ class NeighborKokkos : public Neighbor {
void modify_ex_group_grow_kokkos();
void modify_mol_group_grow_kokkos();
void modify_mol_intra_grow_kokkos();
void set_binsize_kokkos();
};
}

View File

@ -471,6 +471,9 @@ void Neighbor::init()
error->warning(FLERR,"Neighbor exclusions used with KSpace solver "
"may give inconsistent Coulombic energies");
if (lmp->kokkos)
set_binsize_kokkos();
// ------------------------------------------------------------------
// create pairwise lists
// one-time call to init_styles() to scan style files and setup

View File

@ -233,6 +233,7 @@ class Neighbor : protected Pointers {
virtual void init_ex_bit_kokkos() {}
virtual void init_ex_mol_bit_kokkos() {}
virtual void grow_ex_mol_intra_kokkos() {}
virtual void set_binsize_kokkos() {}
};
namespace NeighConst {