forked from lijiext/lammps
Merge pull request #1364 from stanmoore1/kk_binsize
Change defaults for KOKKOS package
This commit is contained in:
commit
cc30688137
|
@ -247,7 +247,10 @@ Maxwell50 = NVIDIA Maxwell generation CC 5.0
|
||||||
Maxwell52 = NVIDIA Maxwell generation CC 5.2
|
Maxwell52 = NVIDIA Maxwell generation CC 5.2
|
||||||
Maxwell53 = NVIDIA Maxwell generation CC 5.3
|
Maxwell53 = NVIDIA Maxwell generation CC 5.3
|
||||||
Pascal60 = NVIDIA Pascal generation CC 6.0
|
Pascal60 = NVIDIA Pascal generation CC 6.0
|
||||||
Pascal61 = NVIDIA Pascal generation CC 6.1 :ul
|
Pascal61 = NVIDIA Pascal generation CC 6.1
|
||||||
|
Volta70 = NVIDIA Volta generation CC 7.0
|
||||||
|
Volta72 = NVIDIA Volta generation CC 7.2
|
||||||
|
Turing75 = NVIDIA Turing generation CC 7.5 :ul
|
||||||
|
|
||||||
[CMake build]:
|
[CMake build]:
|
||||||
|
|
||||||
|
|
|
@ -111,16 +111,10 @@ Makefile.kokkos_mpi_only) will give better performance than the OpenMP
|
||||||
back end (i.e. Makefile.kokkos_omp) because some of the overhead to make
|
back end (i.e. Makefile.kokkos_omp) because some of the overhead to make
|
||||||
the code thread-safe is removed.
|
the code thread-safe is removed.
|
||||||
|
|
||||||
NOTE: The default for the "package kokkos"_package.html command is to
|
NOTE: Use the "-pk kokkos" "command-line switch"_Run_options.html to
|
||||||
use "full" neighbor lists and set the Newton flag to "off" for both
|
change the default "package kokkos"_package.html options. See its doc
|
||||||
pairwise and bonded interactions. However, when running on CPUs, it
|
page for details and default settings. Experimenting with its options
|
||||||
will typically be faster to use "half" neighbor lists and set the
|
can provide a speed-up for specific calculations. For example:
|
||||||
Newton flag to "on", just as is the case for non-accelerated pair
|
|
||||||
styles. It can also be faster to use non-threaded communication. Use
|
|
||||||
the "-pk kokkos" "command-line switch"_Run_options.html to change the
|
|
||||||
default "package kokkos"_package.html options. See its doc page for
|
|
||||||
details and default settings. Experimenting with its options can
|
|
||||||
provide a speed-up for specific calculations. For example:
|
|
||||||
|
|
||||||
mpirun -np 16 lmp_kokkos_mpi_only -k on -sf kk -pk kokkos newton on neigh half comm no -in in.lj # Newton on, Half neighbor list, non-threaded comm :pre
|
mpirun -np 16 lmp_kokkos_mpi_only -k on -sf kk -pk kokkos newton on neigh half comm no -in in.lj # Newton on, Half neighbor list, non-threaded comm :pre
|
||||||
|
|
||||||
|
@ -190,19 +184,18 @@ tasks/node. The "-k on t Nt" command-line switch sets the number of
|
||||||
threads/task as Nt. The product of these two values should be N, i.e.
|
threads/task as Nt. The product of these two values should be N, i.e.
|
||||||
256 or 264.
|
256 or 264.
|
||||||
|
|
||||||
NOTE: The default for the "package kokkos"_package.html command is to
|
NOTE: The default for the "package kokkos"_package.html command when
|
||||||
use "full" neighbor lists and set the Newton flag to "off" for both
|
running on KNL is to use "half" neighbor lists and set the Newton flag
|
||||||
pairwise and bonded interactions. When running on KNL, this will
|
to "on" for both pairwise and bonded interactions. This will typically
|
||||||
typically be best for pair-wise potentials. For many-body potentials,
|
be best for many-body potentials. For simpler pair-wise potentials, it
|
||||||
using "half" neighbor lists and setting the Newton flag to "on" may be
|
may be faster to use a "full" neighbor list with the Newton flag set to "off".
|
||||||
faster. It can also be faster to use non-threaded communication. Use
|
Use the "-pk kokkos" "command-line switch"_Run_options.html to change
|
||||||
the "-pk kokkos" "command-line switch"_Run_options.html to change the
|
the default "package kokkos"_package.html options. See its doc page for
|
||||||
default "package kokkos"_package.html options. See its doc page for
|
details and default settings. Experimenting with its options can provide
|
||||||
details and default settings. Experimenting with its options can
|
a speed-up for specific calculations. For example:
|
||||||
provide a speed-up for specific calculations. For example:
|
|
||||||
|
|
||||||
mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos comm no -in in.lj # Newton off, full neighbor list, non-threaded comm
|
mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos comm host -in in.reax # Newton on, half neighbor list, threaded comm
|
||||||
mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos newton on neigh half comm no -in in.reax # Newton on, half neighbor list, non-threaded comm :pre
|
mpirun -np 64 lmp_kokkos_phi -k on t 4 -sf kk -pk kokkos newton off neigh full comm no -in in.lj # Newton off, full neighbor list, non-threaded comm :pre
|
||||||
|
|
||||||
NOTE: MPI tasks and threads should be bound to cores as described
|
NOTE: MPI tasks and threads should be bound to cores as described
|
||||||
above for CPUs.
|
above for CPUs.
|
||||||
|
@ -236,19 +229,19 @@ one or more nodes, each with two GPUs:
|
||||||
mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -in in.lj # 1 node, 2 MPI tasks/node, 2 GPUs/node
|
mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -in in.lj # 1 node, 2 MPI tasks/node, 2 GPUs/node
|
||||||
mpirun -np 32 -ppn 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -in in.lj # 16 nodes, 2 MPI tasks/node, 2 GPUs/node (32 GPUs total) :pre
|
mpirun -np 32 -ppn 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -in in.lj # 16 nodes, 2 MPI tasks/node, 2 GPUs/node (32 GPUs total) :pre
|
||||||
|
|
||||||
NOTE: The default for the "package kokkos"_package.html command is to
|
NOTE: The default for the "package kokkos"_package.html command when
|
||||||
use "full" neighbor lists and set the Newton flag to "off" for both
|
running on GPUs is to use "full" neighbor lists and set the Newton flag
|
||||||
pairwise and bonded interactions, along with threaded communication.
|
to "off" for both pairwise and bonded interactions, along with threaded
|
||||||
When running on Maxwell or Kepler GPUs, this will typically be
|
communication. When running on Maxwell or Kepler GPUs, this will
|
||||||
best. For Pascal GPUs, using "half" neighbor lists and setting the
|
typically be best. For Pascal GPUs, using "half" neighbor lists and
|
||||||
Newton flag to "on" may be faster. For many pair styles, setting the
|
setting the Newton flag to "on" may be faster. For many pair styles,
|
||||||
neighbor binsize equal to the ghost atom cutoff will give speedup.
|
setting the neighbor binsize equal to twice the CPU default value will
|
||||||
Use the "-pk kokkos" "command-line switch"_Run_options.html to change
|
give speedup, which is the default when running on GPUs. Use the "-pk
|
||||||
the default "package kokkos"_package.html options. See its doc page
|
kokkos" "command-line switch"_Run_options.html to change the default
|
||||||
for details and default settings. Experimenting with its options can
|
"package kokkos"_package.html options. See its doc page for details and
|
||||||
provide a speed-up for specific calculations. For example:
|
default settings. Experimenting with its options can provide a speed-up
|
||||||
|
for specific calculations. For example:
|
||||||
|
|
||||||
mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -pk kokkos binsize 2.8 -in in.lj # Set binsize = neighbor ghost cutoff
|
|
||||||
mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -pk kokkos newton on neigh half binsize 2.8 -in in.lj # Newton on, half neighbor list, set binsize = neighbor ghost cutoff :pre
|
mpirun -np 2 lmp_kokkos_cuda_openmpi -k on g 2 -sf kk -pk kokkos newton on neigh half binsize 2.8 -in in.lj # Newton on, half neighbor list, set binsize = neighbor ghost cutoff :pre
|
||||||
|
|
||||||
NOTE: For good performance of the KOKKOS package on GPUs, you must
|
NOTE: For good performance of the KOKKOS package on GPUs, you must
|
||||||
|
|
|
@ -64,7 +64,7 @@ args = arguments specific to the style :l
|
||||||
{no_affinity} values = none
|
{no_affinity} values = none
|
||||||
{kokkos} args = keyword value ...
|
{kokkos} args = keyword value ...
|
||||||
zero or more keyword/value pairs may be appended
|
zero or more keyword/value pairs may be appended
|
||||||
keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse}
|
keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} or {gpu/direct}
|
||||||
{neigh} value = {full} or {half}
|
{neigh} value = {full} or {half}
|
||||||
full = full neighbor list
|
full = full neighbor list
|
||||||
half = half neighbor list built in thread-safe manner
|
half = half neighbor list built in thread-safe manner
|
||||||
|
@ -72,7 +72,7 @@ args = arguments specific to the style :l
|
||||||
full = full neighbor list
|
full = full neighbor list
|
||||||
half = half neighbor list built in thread-safe manner
|
half = half neighbor list built in thread-safe manner
|
||||||
{newton} = {off} or {on}
|
{newton} = {off} or {on}
|
||||||
off = set Newton pairwise and bonded flags off (default)
|
off = set Newton pairwise and bonded flags off
|
||||||
on = set Newton pairwise and bonded flags on
|
on = set Newton pairwise and bonded flags on
|
||||||
{binsize} value = size
|
{binsize} value = size
|
||||||
size = bin size for neighbor list construction (distance units)
|
size = bin size for neighbor list construction (distance units)
|
||||||
|
@ -425,98 +425,100 @@ processes/threads used for LAMMPS.
|
||||||
The {kokkos} style invokes settings associated with the use of the
|
The {kokkos} style invokes settings associated with the use of the
|
||||||
KOKKOS package.
|
KOKKOS package.
|
||||||
|
|
||||||
All of the settings are optional keyword/value pairs. Each has a
|
All of the settings are optional keyword/value pairs. Each has a default
|
||||||
default value as listed below.
|
value as listed below.
|
||||||
|
|
||||||
The {neigh} keyword determines how neighbor lists are built. A value
|
The {neigh} keyword determines how neighbor lists are built. A value of
|
||||||
of {half} uses a thread-safe variant of half-neighbor lists,
|
{half} uses a thread-safe variant of half-neighbor lists, the same as
|
||||||
the same as used by most pair styles in LAMMPS.
|
used by most pair styles in LAMMPS, which is the default when running on
|
||||||
|
CPUs (i.e. the Kokkos CUDA back end is not enabled).
|
||||||
|
|
||||||
A value of {full} uses a full neighbor lists and is the default. This
|
A value of {full} uses a full neighbor list and is the default when
|
||||||
performs twice as much computation as the {half} option, however that
|
running on GPUs. This performs twice as much computation as the {half}
|
||||||
is often a win because it is thread-safe and doesn't require atomic
|
option, however that is often a win because it is thread-safe and
|
||||||
operations in the calculation of pair forces. For that reason, {full}
|
doesn't require atomic operations in the calculation of pair forces. For
|
||||||
is the default setting. However, when running in MPI-only mode with 1
|
that reason, {full} is the default setting for GPUs. However, when
|
||||||
thread per MPI task, {half} neighbor lists will typically be faster,
|
running on CPUs, a {half} neighbor list is the default because it is
|
||||||
just as it is for non-accelerated pair styles. Similarly, the {neigh/qeq}
|
often faster, just as it is for non-accelerated pair styles. Similarly,
|
||||||
keyword determines how neighbor lists are built for "fix qeq/reax/kk"_fix_qeq_reax.html.
|
the {neigh/qeq} keyword determines how neighbor lists are built for "fix
|
||||||
If not explicitly set, the value of {neigh/qeq} will match {neigh}.
|
qeq/reax/kk"_fix_qeq_reax.html. If not explicitly set, the value of
|
||||||
|
{neigh/qeq} will match {neigh}.
|
||||||
|
|
||||||
The {newton} keyword sets the Newton flags for pairwise and bonded
|
The {newton} keyword sets the Newton flags for pairwise and bonded
|
||||||
interactions to {off} or {on}, the same as the "newton"_newton.html
|
interactions to {off} or {on}, the same as the "newton"_newton.html
|
||||||
command allows. The default is {off} because this will almost always
|
command allows. The default for GPUs is {off} because this will almost
|
||||||
give better performance for the KOKKOS package. This means more
|
always give better performance for the KOKKOS package. This means more
|
||||||
computation is done, but less communication. However, when running in
|
computation is done, but less communication. However, when running on
|
||||||
MPI-only mode with 1 thread per MPI task, a value of {on} will
|
CPUs a value of {on} is the default since it can often be faster, just
|
||||||
typically be faster, just as it is for non-accelerated pair styles.
|
as it is for non-accelerated pair styles.
|
||||||
|
|
||||||
The {binsize} keyword sets the size of bins used to bin atoms in
|
The {binsize} keyword sets the size of bins used to bin atoms in
|
||||||
neighbor list builds. The same value can be set by the "neigh_modify
|
neighbor list builds. The same value can be set by the "neigh_modify
|
||||||
binsize"_neigh_modify.html command. Making it an option in the
|
binsize"_neigh_modify.html command. Making it an option in the package
|
||||||
package kokkos command allows it to be set from the command line. The
|
kokkos command allows it to be set from the command line. The default
|
||||||
default value is 0.0, which means the LAMMPS default will be used,
|
value for CPUs is 0.0, which means the LAMMPS default will be used,
|
||||||
which is bins = 1/2 the size of the pairwise cutoff + neighbor skin
|
which is bins = 1/2 the size of the pairwise cutoff + neighbor skin
|
||||||
distance. This is fine when neighbor lists are built on the CPU. For
|
distance. This is fine when neighbor lists are built on the CPU. For GPU
|
||||||
GPU builds, a 2x larger binsize equal to the pairwise cutoff +
|
builds, a 2x larger binsize equal to the pairwise cutoff + neighbor skin
|
||||||
neighbor skin, is often faster, which can be set by this keyword.
|
is often faster, which is the default. Note that if you use a
|
||||||
Note that if you use a longer-than-usual pairwise cutoff, e.g. to
|
longer-than-usual pairwise cutoff, e.g. to allow for a smaller fraction
|
||||||
allow for a smaller fraction of KSpace work with a "long-range
|
of KSpace work with a "long-range Coulombic solver"_kspace_style.html
|
||||||
Coulombic solver"_kspace_style.html because the GPU is faster at
|
because the GPU is faster at performing pairwise interactions, then this
|
||||||
performing pairwise interactions, then this rule of thumb may give too
|
rule of thumb may give too large a binsize and the default should be
|
||||||
large a binsize.
|
overridden with a smaller value.
|
||||||
|
|
||||||
The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} keywords determine
|
The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse}
|
||||||
whether the host or device performs the packing and unpacking of data
|
keywords determine whether the host or device performs the packing and
|
||||||
when communicating per-atom data between processors. "Exchange"
|
unpacking of data when communicating per-atom data between processors.
|
||||||
communication happens only on timesteps that neighbor lists are
|
"Exchange" communication happens only on timesteps that neighbor lists
|
||||||
rebuilt. The data is only for atoms that migrate to new processors.
|
are rebuilt. The data is only for atoms that migrate to new processors.
|
||||||
"Forward" communication happens every timestep. "Reverse" communication
|
"Forward" communication happens every timestep. "Reverse" communication
|
||||||
happens every timestep if the {newton} option is on. The data is for atom
|
happens every timestep if the {newton} option is on. The data is for
|
||||||
coordinates and any other atom properties that needs to be updated for
|
atom coordinates and any other atom properties that need to be updated
|
||||||
ghost atoms owned by each processor.
|
for ghost atoms owned by each processor.
|
||||||
|
|
||||||
The {comm} keyword is simply a short-cut to set the same value
|
The {comm} keyword is simply a short-cut to set the same value for all of
|
||||||
for both the {comm/exchange} and {comm/forward} and {comm/reverse} keywords.
|
the {comm/exchange} and {comm/forward} and {comm/reverse} keywords.
|
||||||
|
|
||||||
The value options for all 3 keywords are {no} or {host} or {device}.
|
The value options for all 3 keywords are {no} or {host} or {device}. A
|
||||||
A value of {no} means to use the standard non-KOKKOS method of
|
value of {no} means to use the standard non-KOKKOS method of
|
||||||
packing/unpacking data for the communication. A value of {host} means
|
packing/unpacking data for the communication. A value of {host} means to
|
||||||
to use the host, typically a multi-core CPU, and perform the
|
use the host, typically a multi-core CPU, and perform the
|
||||||
packing/unpacking in parallel with threads. A value of {device}
|
packing/unpacking in parallel with threads. A value of {device} means to
|
||||||
means to use the device, typically a GPU, to perform the
|
use the device, typically a GPU, to perform the packing/unpacking
|
||||||
packing/unpacking operation.
|
operation.
|
||||||
|
|
||||||
The optimal choice for these keywords depends on the input script and
|
The optimal choice for these keywords depends on the input script and
|
||||||
the hardware used. The {no} value is useful for verifying that the
|
the hardware used. The {no} value is useful for verifying that the
|
||||||
Kokkos-based {host} and {device} values are working correctly.
|
Kokkos-based {host} and {device} values are working correctly. It is the
|
||||||
It may also be the fastest choice when using Kokkos styles in
|
default when running on CPUs since it is usually the fastest.
|
||||||
MPI-only mode (i.e. with a thread count of 1).
|
|
||||||
|
|
||||||
When running on CPUs or Xeon Phi, the {host} and {device} values work
|
When running on CPUs or Xeon Phi, the {host} and {device} values work
|
||||||
identically. When using GPUs, the {device} value will typically be
|
identically. When using GPUs, the {device} value is the default since it
|
||||||
optimal if all of your styles used in your input script are supported
|
will typically be optimal if all of your styles used in your input
|
||||||
by the KOKKOS package. In this case data can stay on the GPU for many
|
script are supported by the KOKKOS package. In this case data can stay
|
||||||
timesteps without being moved between the host and GPU, if you use the
|
on the GPU for many timesteps without being moved between the host and
|
||||||
{device} value. This requires that your MPI is able to access GPU
|
GPU, if you use the {device} value. This requires that your MPI is able
|
||||||
memory directly. Currently that is true for OpenMPI 1.8 (or later
|
to access GPU memory directly. Currently that is true for OpenMPI 1.8
|
||||||
versions), Mvapich2 1.9 (or later), and CrayMPI. If your script uses
|
(or later versions), Mvapich2 1.9 (or later), and CrayMPI. If your
|
||||||
styles (e.g. fixes) which are not yet supported by the KOKKOS package,
|
script uses styles (e.g. fixes) which are not yet supported by the
|
||||||
then data has to be move between the host and device anyway, so it is
|
KOKKOS package, then data has to be moved between the host and device
|
||||||
typically faster to let the host handle communication, by using the
|
anyway, so it is typically faster to let the host handle communication,
|
||||||
{host} value. Using {host} instead of {no} will enable use of
|
by using the {host} value. Using {host} instead of {no} will enable use
|
||||||
multiple threads to pack/unpack communicated data.
|
of multiple threads to pack/unpack communicated data.
|
||||||
|
|
||||||
The {gpu/direct} keyword chooses whether GPU-direct will be used. When
|
The {gpu/direct} keyword chooses whether GPU-direct will be used. When
|
||||||
this keyword is set to {on}, buffers in GPU memory are passed directly
|
this keyword is set to {on}, buffers in GPU memory are passed directly
|
||||||
through MPI send/receive calls. This reduces overhead of first copying
|
through MPI send/receive calls. This reduces overhead of first copying
|
||||||
the data to the host CPU. However GPU-direct is not supported on all
|
the data to the host CPU. However GPU-direct is not supported on all
|
||||||
systems, which can lead to segmentation faults and would require
|
systems, which can lead to segmentation faults and would require using a
|
||||||
using a value of {off}. If LAMMPS can safely detect that GPU-direct is
|
value of {off}. If LAMMPS can safely detect that GPU-direct is not
|
||||||
not available (currently only possible with OpenMPI v2.0.0 or later),
|
available (currently only possible with OpenMPI v2.0.0 or later), then
|
||||||
then the {gpu/direct} keyword is automatically set to {off} by default.
|
the {gpu/direct} keyword is automatically set to {off} by default. When
|
||||||
When the {gpu/direct} keyword is set to {off} while any of the {comm}
|
the {gpu/direct} keyword is set to {off} while any of the {comm}
|
||||||
keywords are set to {device}, the value for these {comm} keywords will
|
keywords are set to {device}, the value for these {comm} keywords will
|
||||||
be automatically changed to {host}.
|
be automatically changed to {host}. This setting has no effect if not
|
||||||
|
running on GPUs.
|
||||||
|
|
||||||
:line
|
:line
|
||||||
|
|
||||||
|
@ -623,14 +625,16 @@ not used, you must invoke the package intel command in your input
|
||||||
script or or via the "-pk intel" "command-line
|
script or via the "-pk intel" "command-line
|
||||||
switch"_Run_options.html.
|
switch"_Run_options.html.
|
||||||
|
|
||||||
For the KOKKOS package, the option defaults neigh = full, neigh/qeq =
|
For the KOKKOS package, the option defaults for GPUs are neigh = full,
|
||||||
full, newton = off, binsize = 0.0, and comm = device, gpu/direct = on.
|
neigh/qeq = full, newton = off, binsize = 2x LAMMPS default
|
||||||
When LAMMPS can safely detect, that GPU-direct is not available, the
|
value, comm = device, gpu/direct = on. When LAMMPS can safely detect
|
||||||
default value of gpu/direct becomes "off".
|
that GPU-direct is not available, the default value of gpu/direct
|
||||||
These settings are made automatically by the required "-k on"
|
becomes "off". For CPUs or Xeon Phis, the option defaults are neigh =
|
||||||
"command-line switch"_Run_options.html. You can change them by
|
half, neigh/qeq = half, newton = on, binsize = 0.0, and comm = no. These
|
||||||
using the package kokkos command in your input script or via the
|
settings are made automatically by the required "-k on" "command-line
|
||||||
"-pk kokkos command-line switch"_Run_options.html.
|
switch"_Run_options.html. You can change them by using the package
|
||||||
|
kokkos command in your input script or via the "-pk kokkos" "command-line
|
||||||
|
switch"_Run_options.html.
|
||||||
|
|
||||||
For the OMP package, the default is Nthreads = 0 and the option
|
For the OMP package, the default is Nthreads = 0 and the option
|
||||||
defaults are neigh = yes. These settings are made automatically if
|
defaults are neigh = yes. These settings are made automatically if
|
||||||
|
|
|
@ -182,16 +182,28 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
|
||||||
|
|
||||||
// default settings for package kokkos command
|
// default settings for package kokkos command
|
||||||
|
|
||||||
|
binsize = 0.0;
|
||||||
|
gpu_direct_flag = 1;
|
||||||
|
if (ngpu > 0) {
|
||||||
neighflag = FULL;
|
neighflag = FULL;
|
||||||
neighflag_qeq = FULL;
|
neighflag_qeq = FULL;
|
||||||
neighflag_qeq_set = 0;
|
neighflag_qeq_set = 0;
|
||||||
exchange_comm_classic = 0;
|
newtonflag = 0;
|
||||||
forward_comm_classic = 0;
|
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
|
||||||
reverse_comm_classic = 0;
|
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
|
||||||
exchange_comm_on_host = 0;
|
} else {
|
||||||
forward_comm_on_host = 0;
|
if (num_threads > 1) {
|
||||||
reverse_comm_on_host = 0;
|
neighflag = HALFTHREAD;
|
||||||
gpu_direct_flag = 1;
|
neighflag_qeq = HALFTHREAD;
|
||||||
|
} else {
|
||||||
|
neighflag = HALF;
|
||||||
|
neighflag_qeq = HALF;
|
||||||
|
}
|
||||||
|
neighflag_qeq_set = 0;
|
||||||
|
newtonflag = 1;
|
||||||
|
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
|
||||||
|
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
|
||||||
|
}
|
||||||
|
|
||||||
#if KOKKOS_USE_CUDA
|
#if KOKKOS_USE_CUDA
|
||||||
// only if we can safely detect, that GPU-direct is not available, change default
|
// only if we can safely detect, that GPU-direct is not available, change default
|
||||||
|
@ -218,17 +230,6 @@ KokkosLMP::~KokkosLMP()
|
||||||
|
|
||||||
void KokkosLMP::accelerator(int narg, char **arg)
|
void KokkosLMP::accelerator(int narg, char **arg)
|
||||||
{
|
{
|
||||||
// defaults
|
|
||||||
|
|
||||||
neighflag = FULL;
|
|
||||||
neighflag_qeq = FULL;
|
|
||||||
neighflag_qeq_set = 0;
|
|
||||||
int newtonflag = 0;
|
|
||||||
double binsize = 0.0;
|
|
||||||
exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
|
|
||||||
exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
|
|
||||||
gpu_direct_flag = 1;
|
|
||||||
|
|
||||||
int iarg = 0;
|
int iarg = 0;
|
||||||
while (iarg < narg) {
|
while (iarg < narg) {
|
||||||
if (strcmp(arg[iarg],"neigh") == 0) {
|
if (strcmp(arg[iarg],"neigh") == 0) {
|
||||||
|
|
|
@ -36,6 +36,8 @@ class KokkosLMP : protected Pointers {
|
||||||
int numa;
|
int numa;
|
||||||
int auto_sync;
|
int auto_sync;
|
||||||
int gpu_direct_flag;
|
int gpu_direct_flag;
|
||||||
|
int newtonflag;
|
||||||
|
double binsize;
|
||||||
|
|
||||||
KokkosLMP(class LAMMPS *, int, char **);
|
KokkosLMP(class LAMMPS *, int, char **);
|
||||||
~KokkosLMP();
|
~KokkosLMP();
|
||||||
|
|
|
@ -30,6 +30,7 @@
|
||||||
#include "style_nstencil.h"
|
#include "style_nstencil.h"
|
||||||
#include "style_npair.h"
|
#include "style_npair.h"
|
||||||
#include "style_ntopo.h"
|
#include "style_ntopo.h"
|
||||||
|
#include "comm.h"
|
||||||
|
|
||||||
using namespace LAMMPS_NS;
|
using namespace LAMMPS_NS;
|
||||||
|
|
||||||
|
@ -359,6 +360,14 @@ void NeighborKokkos::modify_mol_intra_grow_kokkos(){
|
||||||
k_ex_mol_intra.modify<LMPHostType>();
|
k_ex_mol_intra.modify<LMPHostType>();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ---------------------------------------------------------------------- */
|
||||||
|
void NeighborKokkos::set_binsize_kokkos() {
|
||||||
|
if (!binsizeflag && lmp->kokkos->ngpu > 0) {
|
||||||
|
binsize_user = cutneighmax;
|
||||||
|
binsizeflag = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* ---------------------------------------------------------------------- */
|
/* ---------------------------------------------------------------------- */
|
||||||
|
|
||||||
void NeighborKokkos::init_topology() {
|
void NeighborKokkos::init_topology() {
|
||||||
|
|
|
@ -87,6 +87,7 @@ class NeighborKokkos : public Neighbor {
|
||||||
void modify_ex_group_grow_kokkos();
|
void modify_ex_group_grow_kokkos();
|
||||||
void modify_mol_group_grow_kokkos();
|
void modify_mol_group_grow_kokkos();
|
||||||
void modify_mol_intra_grow_kokkos();
|
void modify_mol_intra_grow_kokkos();
|
||||||
|
void set_binsize_kokkos();
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -471,6 +471,9 @@ void Neighbor::init()
|
||||||
error->warning(FLERR,"Neighbor exclusions used with KSpace solver "
|
error->warning(FLERR,"Neighbor exclusions used with KSpace solver "
|
||||||
"may give inconsistent Coulombic energies");
|
"may give inconsistent Coulombic energies");
|
||||||
|
|
||||||
|
if (lmp->kokkos)
|
||||||
|
set_binsize_kokkos();
|
||||||
|
|
||||||
// ------------------------------------------------------------------
|
// ------------------------------------------------------------------
|
||||||
// create pairwise lists
|
// create pairwise lists
|
||||||
// one-time call to init_styles() to scan style files and setup
|
// one-time call to init_styles() to scan style files and setup
|
||||||
|
|
|
@ -233,6 +233,7 @@ class Neighbor : protected Pointers {
|
||||||
virtual void init_ex_bit_kokkos() {}
|
virtual void init_ex_bit_kokkos() {}
|
||||||
virtual void init_ex_mol_bit_kokkos() {}
|
virtual void init_ex_mol_bit_kokkos() {}
|
||||||
virtual void grow_ex_mol_intra_kokkos() {}
|
virtual void grow_ex_mol_intra_kokkos() {}
|
||||||
|
virtual void set_binsize_kokkos() {}
|
||||||
};
|
};
|
||||||
|
|
||||||
namespace NeighConst {
|
namespace NeighConst {
|
||||||
|
|
Loading…
Reference in New Issue