forked from lijiext/lammps
772 lines
44 KiB
HTML
772 lines
44 KiB
HTML
|
|
|
|
<!DOCTYPE html>
|
|
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
|
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
|
<head>
|
|
<meta charset="utf-8">
|
|
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
|
|
<title>package command — LAMMPS documentation</title>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="_static/sphinxcontrib-images/LightBox2/lightbox2/css/lightbox.css" type="text/css" />
|
|
|
|
|
|
|
|
<link rel="top" title="LAMMPS documentation" href="index.html"/>
|
|
|
|
|
|
<script src="_static/js/modernizr.min.js"></script>
|
|
|
|
</head>
|
|
|
|
<body class="wy-body-for-nav" role="document">
|
|
|
|
<div class="wy-grid-for-nav">
|
|
|
|
|
|
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
|
<div class="wy-side-nav-search">
|
|
|
|
|
|
|
|
<a href="Manual.html" class="icon icon-home"> LAMMPS
|
|
|
|
|
|
|
|
</a>
|
|
|
|
|
|
<div role="search">
|
|
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
|
|
<input type="text" name="q" placeholder="Search docs" />
|
|
<input type="hidden" name="check_keywords" value="yes" />
|
|
<input type="hidden" name="area" value="default" />
|
|
</form>
|
|
</div>
|
|
|
|
|
|
</div>
|
|
|
|
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
|
|
|
|
|
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_intro.html">1. Introduction</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_start.html">2. Getting Started</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_commands.html">3. Commands</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_packages.html">4. Packages</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_accelerate.html">5. Accelerating LAMMPS performance</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_howto.html">6. How-to discussions</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_example.html">7. Example problems</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_perf.html">8. Performance & scalability</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_tools.html">9. Additional tools</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_modify.html">10. Modifying & extending LAMMPS</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_python.html">11. Python interface to LAMMPS</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_errors.html">12. Errors</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_history.html">13. Future and history</a></li>
|
|
</ul>
|
|
|
|
|
|
|
|
</div>
|
|
|
|
</nav>
|
|
|
|
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
|
|
|
|
|
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
|
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
|
<a href="Manual.html">LAMMPS</a>
|
|
</nav>
|
|
|
|
|
|
|
|
<div class="wy-nav-content">
|
|
<div class="rst-content">
|
|
<div role="navigation" aria-label="breadcrumbs navigation">
|
|
<ul class="wy-breadcrumbs">
|
|
<li><a href="Manual.html">Docs</a> »</li>
|
|
|
|
<li>package command</li>
|
|
<li class="wy-breadcrumbs-aside">
|
|
|
|
|
|
<a href="http://lammps.sandia.gov">Website</a>
|
|
<a href="Section_commands.html#comm">Commands</a>
|
|
|
|
</li>
|
|
</ul>
|
|
<hr/>
|
|
|
|
</div>
|
|
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
|
|
<div itemprop="articleBody">
|
|
|
|
<div class="section" id="package-command">
|
|
<span id="index-0"></span><h1>package command<a class="headerlink" href="#package-command" title="Permalink to this headline">¶</a></h1>
|
|
<div class="section" id="syntax">
|
|
<h2>Syntax<a class="headerlink" href="#syntax" title="Permalink to this headline">¶</a></h2>
|
|
<div class="highlight-python"><div class="highlight"><pre>package style args
|
|
</pre></div>
|
|
</div>
|
|
<ul class="simple">
|
|
<li>style = <em>cuda</em> or <em>gpu</em> or <em>intel</em> or <em>kokkos</em> or <em>omp</em></li>
|
|
<li>args = arguments specific to the style</li>
|
|
</ul>
|
|
<pre class="literal-block">
|
|
<em>cuda</em> args = Ngpu keyword value ...
|
|
Ngpu = # of GPUs per node
|
|
zero or more keyword/value pairs may be appended
|
|
keywords = <em>newton</em> or <em>gpuID</em> or <em>timing</em> or <em>test</em> or <em>thread</em>
|
|
<em>newton</em> = <em>off</em> or <em>on</em>
|
|
off = set Newton pairwise and bonded flags off (default)
|
|
on = set Newton pairwise and bonded flags on
|
|
<em>gpuID</em> values = gpu1 .. gpuN
|
|
gpu1 .. gpuN = IDs of the Ngpu GPUs to use
|
|
<em>timing</em> values = none
|
|
<em>test</em> values = id
|
|
id = atom-ID of a test particle
|
|
<em>thread</em> = auto or tpa or bpa
|
|
auto = test whether tpa or bpa is faster
|
|
tpa = one thread per atom
|
|
bpa = one block per atom
|
|
<em>gpu</em> args = Ngpu keyword value ...
|
|
Ngpu = # of GPUs per node
|
|
zero or more keyword/value pairs may be appended
|
|
keywords = <em>neigh</em> or <em>newton</em> or <em>binsize</em> or <em>split</em> or <em>gpuID</em> or <em>tpa</em> or <em>device</em> or <em>blocksize</em>
|
|
<em>neigh</em> value = <em>yes</em> or <em>no</em>
|
|
yes = neighbor list build on GPU (default)
|
|
no = neighbor list build on CPU
|
|
<em>newton</em> = <em>off</em> or <em>on</em>
|
|
off = set Newton pairwise flag off (default and required)
|
|
on = set Newton pairwise flag on (currently not allowed)
|
|
<em>binsize</em> value = size
|
|
size = bin size for neighbor list construction (distance units)
|
|
<em>split</em> = fraction
|
|
fraction = fraction of atoms assigned to GPU (default = 1.0)
|
|
<em>gpuID</em> values = first last
|
|
first = ID of first GPU to be used on each node
|
|
last = ID of last GPU to be used on each node
|
|
<em>tpa</em> value = Nthreads
|
|
Nthreads = # of GPU threads used per atom
|
|
<em>device</em> value = device_type
|
|
device_type = <em>kepler</em> or <em>fermi</em> or <em>cypress</em> or <em>generic</em>
|
|
<em>blocksize</em> value = size
|
|
size = thread block size for pair force computation
|
|
<em>intel</em> args = Nphi keyword value ...
|
|
Nphi = # of coprocessors per node
|
|
zero or more keyword/value pairs may be appended
|
|
keywords = <em>omp</em> or <em>mode</em> or <em>balance</em> or <em>ghost</em> or <em>tpc</em> or <em>tptask</em> or <em>no_affinity</em>
|
|
<em>omp</em> value = Nthreads
|
|
Nthreads = number of OpenMP threads to use on CPU (default = 0)
|
|
<em>mode</em> value = <em>single</em> or <em>mixed</em> or <em>double</em>
|
|
single = perform force calculations in single precision
|
|
mixed = perform force calculations in mixed precision
|
|
double = perform force calculations in double precision
|
|
<em>balance</em> value = split
|
|
split = fraction of work to offload to coprocessor, -1 for dynamic
|
|
<em>ghost</em> value = <em>yes</em> or <em>no</em>
|
|
yes = include ghost atoms for offload
|
|
no = do not include ghost atoms for offload
|
|
<em>tpc</em> value = Ntpc
|
|
Ntpc = max number of coprocessor threads per coprocessor core (default = 4)
|
|
<em>tptask</em> value = Ntptask
|
|
Ntptask = max number of coprocessor threads per MPI task (default = 240)
|
|
<em>no_affinity</em> values = none
|
|
<em>kokkos</em> args = keyword value ...
|
|
zero or more keyword/value pairs may be appended
|
|
keywords = <em>neigh</em> or <em>newton</em> or <em>binsize</em> or <em>comm</em> or <em>comm/exchange</em> or <em>comm/forward</em>
|
|
<em>neigh</em> value = <em>full</em> or <em>half/thread</em> or <em>half</em> or <em>n2</em> or <em>full/cluster</em>
|
|
full = full neighbor list
|
|
half/thread = half neighbor list built in thread-safe manner
|
|
half = half neighbor list, not thread-safe, only use when 1 thread/MPI task
|
|
n2 = non-binning neighbor list build, O(N^2) algorithm
|
|
full/cluster = full neighbor list with clustered groups of atoms
|
|
<em>newton</em> = <em>off</em> or <em>on</em>
|
|
off = set Newton pairwise and bonded flags off (default)
|
|
on = set Newton pairwise and bonded flags on
|
|
<em>binsize</em> value = size
|
|
size = bin size for neighbor list construction (distance units)
|
|
<em>comm</em> value = <em>no</em> or <em>host</em> or <em>device</em>
|
|
use value for both comm/exchange and comm/forward
|
|
<em>comm/exchange</em> value = <em>no</em> or <em>host</em> or <em>device</em>
|
|
<em>comm/forward</em> value = <em>no</em> or <em>host</em> or <em>device</em>
|
|
no = perform communication pack/unpack in non-KOKKOS mode
|
|
host = perform pack/unpack on host (e.g. with OpenMP threading)
|
|
device = perform pack/unpack on device (e.g. on GPU)
|
|
<em>omp</em> args = Nthreads keyword value ...
|
|
Nthreads = # of OpenMP threads to associate with each MPI process
|
|
zero or more keyword/value pairs may be appended
|
|
keywords = <em>neigh</em>
|
|
<em>neigh</em> value = <em>yes</em> or <em>no</em>
|
|
yes = threaded neighbor list build (default)
|
|
no = non-threaded neighbor list build
|
|
</pre>
|
|
</div>
|
|
<div class="section" id="examples">
|
|
<h2>Examples<a class="headerlink" href="#examples" title="Permalink to this headline">¶</a></h2>
|
|
<div class="highlight-python"><div class="highlight"><pre>package gpu 1
|
|
package gpu 1 split 0.75
|
|
package gpu 2 split -1.0
|
|
package cuda 2 gpuID 0 2
|
|
package cuda 1 test 3948
|
|
package kokkos neigh half/thread comm device
|
|
package omp 0 neigh no
|
|
package omp 4
|
|
package intel 1
|
|
package intel 2 omp 4 mode mixed balance 0.5
|
|
</pre></div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="description">
|
|
<h2>Description<a class="headerlink" href="#description" title="Permalink to this headline">¶</a></h2>
|
|
<p>This command invokes package-specific settings for the various
|
|
accelerator packages available in LAMMPS. Currently the following
|
|
packages use settings from this command: USER-CUDA, GPU, USER-INTEL,
|
|
KOKKOS, and USER-OMP.</p>
|
|
<p>If this command is specified in an input script, it must be near the
|
|
top of the script, before the simulation box has been defined. This
|
|
is because it specifies settings that the accelerator packages use in
|
|
their initialization, before a simulation is defined.</p>
|
|
<p>This command can also be specified from the command-line when
|
|
launching LAMMPS, using the “-pk” <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>. The syntax is exactly the same as
|
|
when used in an input script.</p>
|
|
<p>Note that all of the accelerator packages require the package command
|
|
to be specified (except the OPT package), if the package is to be used
|
|
in a simulation (LAMMPS can be built with an accelerator package
|
|
without using it in a particular simulation). However, in all cases,
|
|
a default version of the command is typically invoked by other
|
|
accelerator settings.</p>
|
|
<p>The USER-CUDA and KOKKOS packages require a “-c on” or “-k on”
|
|
<a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a> respectively, which
|
|
invokes a “package cuda” or “package kokkos” command with default
|
|
settings.</p>
|
|
<p>For the GPU, USER-INTEL, and USER-OMP packages, if a “-sf gpu” or “-sf
|
|
intel” or “-sf omp” <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>
|
|
is used to auto-append accelerator suffixes to various styles in the
|
|
input script, then those switches also invoke a “package gpu”,
|
|
“package intel”, or “package omp” command with default settings.</p>
|
|
<div class="admonition note">
|
|
<p class="first admonition-title">Note</p>
|
|
<p class="last">A package command for a particular style can be invoked multiple
|
|
times when a simulation is set up, e.g. by the “-c on”, “-k on”, “-sf”,
|
|
and “-pk” <a class="reference internal" href="Section_start.html#start-7"><span>command-line switches</span></a>, and by
|
|
using this command in an input script. Each time it is used all of
|
|
the style options are set, either to default values or to specified
|
|
settings. I.e. settings from previous invocations do not persist
|
|
across multiple invocations.</p>
|
|
</div>
|
|
<p>See the <a class="reference internal" href="Section_accelerate.html"><em>Section Accelerate</em></a> section of the
|
|
manual for more details about using the various accelerator packages
|
|
for speeding up LAMMPS simulations.</p>
|
|
<hr class="docutils" />
|
|
<p>The <em>cuda</em> style invokes settings associated with the use of the
|
|
USER-CUDA package.</p>
|
|
<p>The <em>Ngpu</em> argument sets the number of GPUs per node. There must be
|
|
exactly one MPI task per GPU, as set by the mpirun or mpiexec command.</p>
|
|
<p>Optional keyword/value pairs can also be specified. Each has a
|
|
default value as listed below.</p>
|
|
<p>The <em>newton</em> keyword sets the Newton flags for pairwise and bonded
|
|
interactions to <em>off</em> or <em>on</em>, the same as the <a class="reference internal" href="newton.html"><em>newton</em></a>
|
|
command allows. The default is <em>off</em> because this will almost always
|
|
give better performance for the USER-CUDA package. This means
|
|
more computation is done, but less communication.</p>
|
|
<p>The <em>gpuID</em> keyword allows selection of which GPUs on each node will
|
|
be used for a simulation. GPU IDs range from 0 to N-1 where N is the
|
|
physical number of GPUs/node. An ID is specified for each of the
|
|
Ngpus being used. For example if you have three GPUs on a machine,
|
|
one of which is used for the X-Server (the GPU with the ID 1) while
|
|
the others (with IDs 0 and 2) are used for computations you would
|
|
specify:</p>
|
|
<div class="highlight-python"><div class="highlight"><pre>package cuda 2 gpuID 0 2
|
|
</pre></div>
|
|
</div>
|
|
<p>The purpose of the <em>gpuID</em> keyword is to allow two (or more)
|
|
simulations to be run on one workstation. In that case one could set
|
|
the first simulation to use GPU 0 and the second to use GPU 1. This is
|
|
not necessary however, if the GPUs are in what is called <em>compute
|
|
exclusive</em> mode. Using that setting, every process will get its own
|
|
GPU automatically. This <em>compute exclusive</em> mode can be set as root
|
|
using the <em>nvidia-smi</em> tool which is part of the CUDA installation.</p>
|
|
<p>Also note that if the <em>gpuID</em> keyword is not used, the USER-CUDA
|
|
package sorts existing GPUs on each node according to their number of
|
|
multiprocessors. This way, compute GPUs will be prioritized over
|
|
X-Server GPUs.</p>
|
|
<p>If the <em>timing</em> keyword is specified, detailed timing information for
|
|
various subroutines will be output.</p>
|
|
<p>If the <em>test</em> keyword is specified, information for the specified atom
|
|
with atom-ID will be output at several points during each timestep.
|
|
This is mainly useful for debugging purposes. Note that the
|
|
simulation slows down dramatically if this option is used.</p>
|
|
<p>The <em>thread</em> keyword can be used to specify how GPU threads are
|
|
assigned work during pair style force evaluation. If the value =
|
|
<em>tpa</em>, one thread per atom is used. If the value = <em>bpa</em>, one block
|
|
per atom is used. If the value = <em>auto</em>, a short test is performed at
|
|
the beginning of each run to determine whether <em>tpa</em> or <em>bpa</em> mode is
|
|
faster. The result of this test is output. Since <em>auto</em> is the
|
|
default value, it is usually not necessary to use this keyword.</p>
|
|
<hr class="docutils" />
|
|
<p>The <em>gpu</em> style invokes settings associated with the use of the GPU
|
|
package.</p>
|
|
<p>The <em>Ngpu</em> argument sets the number of GPUs per node. There must be
|
|
at least as many MPI tasks per node as GPUs, as set by the mpirun or
|
|
mpiexec command. If there are more MPI tasks (per node)
|
|
than GPUs, multiple MPI tasks will share each GPU.</p>
|
|
<p>Optional keyword/value pairs can also be specified. Each has a
|
|
default value as listed below.</p>
|
|
<p>The <em>neigh</em> keyword specifies where neighbor lists for pair style
|
|
computation will be built. If <em>neigh</em> is <em>yes</em>, which is the default,
|
|
neighbor list building is performed on the GPU. If <em>neigh</em> is <em>no</em>,
|
|
neighbor list building is performed on the CPU. GPU neighbor list
|
|
building currently cannot be used with a triclinic box. GPU neighbor
|
|
list calculation currently cannot be used with
|
|
<a class="reference internal" href="pair_hybrid.html"><em>hybrid</em></a> pair styles. GPU neighbor lists are not
|
|
compatible with commands that are not GPU-enabled. When a non-GPU
|
|
enabled command requires a neighbor list, it will also be built on the
|
|
CPU. In these cases, it will typically be more efficient to only use
|
|
CPU neighbor list builds.</p>
|
|
<p>The <em>newton</em> keyword sets the Newton flags for pairwise (not bonded)
|
|
interactions to <em>off</em> or <em>on</em>, the same as the <a class="reference internal" href="newton.html"><em>newton</em></a>
|
|
command allows. Currently, only an <em>off</em> value is allowed, since all
|
|
the GPU package pair styles require this setting. This means more
|
|
computation is done, but less communication. In the future a value of
|
|
<em>on</em> may be allowed, so the <em>newton</em> keyword is included as an option
|
|
for compatibility with the package command for other accelerator
|
|
styles. Note that the newton setting for bonded interactions is not
|
|
affected by this keyword.</p>
|
|
<p>The <em>binsize</em> keyword sets the size of bins used to bin atoms in
|
|
neighbor list builds performed on the GPU, if <em>neigh</em> = <em>yes</em> is set.
|
|
If <em>binsize</em> is set to 0.0 (the default), then bins = the size of the
|
|
pairwise cutoff + neighbor skin distance. This is 2x larger than the
|
|
LAMMPS default used for neighbor list building on the CPU. This will
|
|
be close to optimal for the GPU, so you do not normally need to use
|
|
this keyword. Note that if you use a longer-than-usual pairwise
|
|
cutoff, e.g. to allow for a smaller fraction of KSpace work with a
|
|
<a class="reference internal" href="kspace_style.html"><em>long-range Coulombic solver</em></a> because the GPU is
|
|
faster at performing pairwise interactions, then it may be optimal to
|
|
make the <em>binsize</em> smaller than the default. For example, with a
|
|
cutoff of 20*sigma in LJ <a class="reference internal" href="units.html"><em>units</em></a> and a neighbor skin
|
|
distance of sigma, a <em>binsize</em> = 5.25*sigma can be more efficient than
|
|
the default.</p>
|
|
<p>The <em>split</em> keyword can be used for load balancing force calculations
|
|
between CPU and GPU cores in GPU-enabled pair styles. If 0 < <em>split</em> <
|
|
1.0, a fixed fraction of particles is offloaded to the GPU while force
|
|
calculation for the other particles occurs simultaneously on the CPU.
|
|
If <em>split</em> < 0.0, the optimal fraction (based on CPU and GPU timings)
|
|
is calculated every 25 timesteps, i.e. dynamic load-balancing across
|
|
the CPU and GPU is performed. If <em>split</em> = 1.0, all force
|
|
calculations for GPU accelerated pair styles are performed on the GPU.
|
|
In this case, other <a class="reference internal" href="pair_hybrid.html"><em>hybrid</em></a> pair interactions,
|
|
<a class="reference internal" href="bond_style.html"><em>bond</em></a>, <a class="reference internal" href="angle_style.html"><em>angle</em></a>,
|
|
<a class="reference internal" href="dihedral_style.html"><em>dihedral</em></a>, <a class="reference internal" href="improper_style.html"><em>improper</em></a>, and
|
|
<a class="reference internal" href="kspace_style.html"><em>long-range</em></a> calculations can be performed on the
|
|
CPU while the GPU is performing force calculations for the GPU-enabled
|
|
pair style. If all CPU force computations complete before the GPU
|
|
completes, LAMMPS will block until the GPU has finished before
|
|
continuing the timestep.</p>
|
|
<p>As an example, if you have two GPUs per node and 8 CPU cores per node,
|
|
and would like to run on 4 nodes (32 cores) with dynamic balancing of
|
|
force calculation across CPU and GPU cores, you could specify</p>
|
|
<div class="highlight-python"><div class="highlight"><pre>mpirun -np 32 -sf gpu -in in.script # launch command
|
|
package gpu 2 split -1 # input script command
|
|
</pre></div>
|
|
</div>
|
|
<p>In this case, all CPU cores and GPU devices on the nodes would be
|
|
utilized. Each GPU device would be shared by 4 CPU cores. The CPU
|
|
cores would perform force calculations for some fraction of the
|
|
particles at the same time the GPUs performed force calculation for
|
|
the other particles.</p>
|
|
<p>The <em>gpuID</em> keyword allows selection of which GPUs on each node will
|
|
be used for a simulation. The <em>first</em> and <em>last</em> values specify the
|
|
GPU IDs to use (from 0 to Ngpu-1). By default, first = 0 and last =
|
|
Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number
|
|
of physical GPUs. If you only wish to use a subset, set Ngpu to a
|
|
smaller number and first/last to a sub-range of the available GPUs.</p>
|
|
<p>The <em>tpa</em> keyword sets the number of GPU threads per atom used to
|
|
perform force calculations. With a default value of 1, the number of
|
|
threads will be chosen based on the pair style, however, the value can
|
|
be set explicitly with this keyword to fine-tune performance. For
|
|
large cutoffs or with a small number of particles per GPU, increasing
|
|
the value can improve performance. The number of threads per atom must
|
|
be a power of 2 and currently cannot be greater than 32.</p>
|
|
<p>The <em>device</em> keyword can be used to tune parameters optimized for a
|
|
specific accelerator, when using OpenCL. For CUDA, the <em>device</em>
|
|
keyword is ignored. Currently, the device type is limited to NVIDIA
|
|
Kepler, NVIDIA Fermi, AMD Cypress, or a generic device. More devices
|
|
may be added later. The default device type can be specified when
|
|
building LAMMPS with the GPU library, via settings in the
|
|
lib/gpu/Makefile that is used.</p>
|
|
<p>The <em>blocksize</em> keyword allows you to tweak the number of threads used
|
|
per thread block. This number should be a multiple of 32 (for GPUs)
|
|
and its maximum depends on the specific GPU hardware. Typical choices
|
|
are 64, 128, or 256. A larger blocksize increases occupancy of
|
|
individual GPU cores, but reduces the total number of thread blocks,
|
|
thus may lead to load imbalance.</p>
|
|
<hr class="docutils" />
|
|
<p>The <em>intel</em> style invokes settings associated with the use of the
|
|
USER-INTEL package. All of its settings, except the <em>omp</em> and <em>mode</em>
|
|
keywords, are ignored if LAMMPS was not built with Xeon Phi
|
|
coprocessor support. All of its settings, including the <em>omp</em> and
|
|
<em>mode</em> keyword are applicable if LAMMPS was built with coprocessor
|
|
support.</p>
|
|
<p>The <em>Nphi</em> argument sets the number of coprocessors per node.
|
|
This can be set to any value, including 0, if LAMMPS was not
|
|
built with coprocessor support.</p>
|
|
<p>Optional keyword/value pairs can also be specified. Each has a
|
|
default value as listed below.</p>
|
|
<p>The <em>omp</em> keyword determines the number of OpenMP threads allocated
|
|
for each MPI task when any portion of the interactions computed by a
|
|
USER-INTEL pair style are run on the CPU. This can be the case even
|
|
if LAMMPS was built with coprocessor support; see the <em>balance</em>
|
|
keyword discussion below. If you are running with fewer MPI tasks/node
|
|
than there are CPUs, it can be advantageous to use OpenMP threading on
|
|
the CPUs.</p>
|
|
<div class="admonition note">
|
|
<p class="first admonition-title">Note</p>
|
|
<p class="last">The <em>omp</em> keyword has nothing to do with coprocessor threads on
|
|
the Xeon Phi; see the <em>tpc</em> and <em>tptask</em> keywords below for a
|
|
discussion of coprocessor threads.</p>
|
|
</div>
|
|
<p>The <em>Nthreads</em> value for the <em>omp</em> keyword sets the number of OpenMP
|
|
threads allocated for each MPI task. Setting <em>Nthreads</em> = 0 (the
|
|
default) instructs LAMMPS to use whatever value is the default for the
|
|
given OpenMP environment. This is usually determined via the
|
|
<em>OMP_NUM_THREADS</em> environment variable or the compiler runtime, which
|
|
is usually a value of 1.</p>
|
|
<p>For more details, including examples of how to set the OMP_NUM_THREADS
|
|
environment variable, see the discussion of the <em>Nthreads</em> setting on
|
|
this doc page for the “package omp” command. Nthreads is a required
|
|
argument for the USER-OMP package. Its meaning is exactly the same
|
|
for the USER-INTEL package.</p>
|
|
<div class="admonition note">
|
|
<p class="first admonition-title">Note</p>
|
|
<p class="last">If you build LAMMPS with both the USER-INTEL and USER-OMP
|
|
packages, be aware that both packages allow setting of the <em>Nthreads</em>
|
|
value via their package commands, but there is only a single global
|
|
<em>Nthreads</em> value used by OpenMP. Thus if both package commands are
|
|
invoked, you should ensure the two values are consistent. If they are
|
|
not, the last one invoked will take precedence, for both packages.
|
|
Also note that if the “-sf hybrid intel omp” <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a> is used, it invokes a “package
|
|
intel” command, followed by a “package omp” command, both with a
|
|
setting of <em>Nthreads</em> = 0.</p>
|
|
</div>
|
|
<p>The <em>mode</em> keyword determines the precision mode to use for
|
|
computing pair style forces, either on the CPU or on the coprocessor,
|
|
when using a USER-INTEL supported <a class="reference internal" href="pair_style.html"><em>pair style</em></a>. It
|
|
can take a value of <em>single</em>, <em>mixed</em> which is the default, or
|
|
<em>double</em>. <em>Single</em> means single precision is used for the entire
|
|
force calculation. <em>Mixed</em> means forces between a pair of atoms are
|
|
computed in single precision, but accumulated and stored in double
|
|
precision, including storage of forces, torques, energies, and virial
|
|
quantities. <em>Double</em> means double precision is used for the entire
|
|
force calculation.</p>
|
|
<p>The <em>balance</em> keyword sets the fraction of <a class="reference internal" href="pair_style.html"><em>pair style</em></a> work offloaded to the coprocessor for split
|
|
values between 0.0 and 1.0 inclusive. While this fraction of work is
|
|
running on the coprocessor, other calculations will run on the host,
|
|
including neighbor and pair calculations that are not offloaded, as
|
|
well as angle, bond, dihedral, kspace, and some MPI communications.
|
|
If <em>split</em> is set to -1, the fraction of work is dynamically adjusted
|
|
automatically throughout the run. This typically gives performance
|
|
within 5 to 10 percent of the optimal fixed fraction.</p>
|
|
<p>The <em>ghost</em> keyword determines whether or not ghost atoms, i.e. atoms
|
|
at the boundaries of processor sub-domains, are offloaded for neighbor
|
|
and force calculations. When the value = “no”, ghost atoms are not
|
|
offloaded. This option can reduce the amount of data transfer with
|
|
the coprocessor and can also overlap MPI communication of forces with
|
|
computation on the coprocessor when the <a class="reference internal" href="newton.html"><em>newton pair</em></a>
|
|
setting is “on”. When the value = “yes”, ghost atoms are offloaded.
|
|
In some cases this can provide better performance, especially if the
|
|
<em>balance</em> fraction is high.</p>
|
|
<p>The <em>tpc</em> keyword sets the max # of coprocessor threads <em>Ntpc</em> that
|
|
will run on each core of the coprocessor. The default value = 4,
|
|
which is the number of hardware threads per core supported by the
|
|
current generation Xeon Phi chips.</p>
|
|
<p>The <em>tptask</em> keyword sets the max # of coprocessor threads <em>Ntptask</em>
|
|
assigned to each MPI task. The default value = 240, which is the
|
|
total # of threads an entire current generation Xeon Phi chip can run
|
|
(240 = 60 cores * 4 threads/core). This means each MPI task assigned
|
|
to the Phi will have enough threads for the chip to run the max allowed,
|
|
even if only 1 MPI task is assigned. If 8 MPI tasks are assigned to
|
|
the Phi, each will run with 30 threads. If you wish to limit the
|
|
number of threads per MPI task, set <em>tptask</em> to a smaller value.
|
|
E.g. for <em>tptask</em> = 16, if 8 MPI tasks are assigned, each will run
|
|
with 16 threads, for a total of 128.</p>
|
|
<p>Note that the default settings for <em>tpc</em> and <em>tptask</em> are fine for
|
|
most problems, regardless of how many MPI tasks you assign to a Phi.</p>
|
|
<p>The <em>no_affinity</em> keyword will turn off automatic setting of core
|
|
affinity for MPI tasks and OpenMP threads on the host when using
|
|
offload to a coprocessor. Affinity settings are used when possible
|
|
to prevent MPI tasks and OpenMP threads from being on separate NUMA
|
|
domains and to prevent offload threads from interfering with other
|
|
processes/threads used for LAMMPS.</p>
|
|
<hr class="docutils" />
|
|
<p>The <em>kokkos</em> style invokes settings associated with the use of the
|
|
KOKKOS package.</p>
|
|
<p>All of the settings are optional keyword/value pairs. Each has a
|
|
default value as listed below.</p>
|
|
<p>The <em>neigh</em> keyword determines how neighbor lists are built. A value
|
|
of <em>half</em> uses half-neighbor lists, the same as used by most pair
|
|
styles in LAMMPS. A value of <em>half/thread</em> uses a thread-safe variant
|
|
of the half-neighbor list. It should be used instead of <em>half</em> when
|
|
running with more than 1 thread per MPI task on a CPU. A value of
|
|
<em>n2</em> uses an O(N^2) algorithm to build the neighbor list without
|
|
binning, where N = # of atoms on a processor. It is typically slower
|
|
than the other methods, which use binning.</p>
|
|
<p>A value of <em>full</em> uses a full neighbor list and is the default. This
|
|
performs twice as much computation as the <em>half</em> option, however that
|
|
is often a win because it is thread-safe and doesn’t require atomic
|
|
operations in the calculation of pair forces. For that reason, <em>full</em>
|
|
is the default setting. However, when running in MPI-only mode with 1
|
|
thread per MPI task, <em>half</em> neighbor lists will typically be faster,
|
|
just as it is for non-accelerated pair styles.</p>
|
|
<p>A value of <em>full/cluster</em> is an experimental neighbor style, where
|
|
particles interact with all particles within a small cluster, if at
|
|
least one of the cluster’s particles is within the neighbor cutoff
|
|
range. This potentially allows for better vectorization on
|
|
architectures such as the Intel Phi. It also reduces the size of the
|
|
neighbor list by roughly a factor of the cluster size, thus reducing
|
|
the total memory footprint considerably.</p>
|
|
<p>The <em>newton</em> keyword sets the Newton flags for pairwise and bonded
|
|
interactions to <em>off</em> or <em>on</em>, the same as the <a class="reference internal" href="newton.html"><em>newton</em></a>
|
|
command allows. The default is <em>off</em> because this will almost always
|
|
give better performance for the KOKKOS package. This means more
|
|
computation is done, but less communication. However, when running in
|
|
MPI-only mode with 1 thread per MPI task, a value of <em>on</em> will
|
|
typically be faster, just as it is for non-accelerated pair styles.</p>
|
|
<p>The <em>binsize</em> keyword sets the size of bins used to bin atoms in
|
|
neighbor list builds. The same value can be set by the <a class="reference internal" href="neigh_modify.html"><em>neigh_modify binsize</em></a> command. Making it an option in the
|
|
package kokkos command allows it to be set from the command line. The
|
|
default value is 0.0, which means the LAMMPS default will be used,
|
|
which is bins = 1/2 the size of the pairwise cutoff + neighbor skin
|
|
distance. This is fine when neighbor lists are built on the CPU. For
|
|
GPU builds, a 2x larger binsize equal to the pairwise cutoff +
|
|
neighbor skin, is often faster, which can be set by this keyword.
|
|
Note that if you use a longer-than-usual pairwise cutoff, e.g. to
|
|
allow for a smaller fraction of KSpace work with a <a class="reference internal" href="kspace_style.html"><em>long-range Coulombic solver</em></a> because the GPU is faster at
|
|
performing pairwise interactions, then this rule of thumb may give too
|
|
large a binsize.</p>
|
|
<p>The <em>comm</em> and <em>comm/exchange</em> and <em>comm/forward</em> keywords determine
|
|
whether the host or device performs the packing and unpacking of data
|
|
when communicating per-atom data between processors. “Exchange”
|
|
communication happens only on timesteps that neighbor lists are
|
|
rebuilt. The data is only for atoms that migrate to new processors.
|
|
“Forward” communication happens every timestep. The data is for atom
|
|
coordinates and any other atom properties that need to be updated for
|
|
ghost atoms owned by each processor.</p>
|
|
<p>The <em>comm</em> keyword is simply a short-cut to set the same value
|
|
for both the <em>comm/exchange</em> and <em>comm/forward</em> keywords.</p>
|
|
<p>The value options for all 3 keywords are <em>no</em> or <em>host</em> or <em>device</em>.
|
|
A value of <em>no</em> means to use the standard non-KOKKOS method of
|
|
packing/unpacking data for the communication. A value of <em>host</em> means
|
|
to use the host, typically a multi-core CPU, and perform the
|
|
packing/unpacking in parallel with threads. A value of <em>device</em> means
|
|
to use the device, typically a GPU, to perform the packing/unpacking
|
|
operation.</p>
|
|
<p>The optimal choice for these keywords depends on the input script and
|
|
the hardware used. The <em>no</em> value is useful for verifying that the
|
|
Kokkos-based <em>host</em> and <em>device</em> values are working correctly. It may
|
|
also be the fastest choice when using Kokkos styles in MPI-only mode
|
|
(i.e. with a thread count of 1).</p>
|
|
<p>When running on CPUs or Xeon Phi, the <em>host</em> and <em>device</em> values work
|
|
identically. When using GPUs, the <em>device</em> value will typically be
|
|
optimal if all of your styles used in your input script are supported
|
|
by the KOKKOS package. In this case data can stay on the GPU for many
|
|
timesteps without being moved between the host and GPU, if you use the
|
|
<em>device</em> value. This requires that your MPI is able to access GPU
|
|
memory directly. Currently that is true for OpenMPI 1.8 (or later
|
|
versions), Mvapich2 1.9 (or later), and CrayMPI. If your script uses
|
|
styles (e.g. fixes) which are not yet supported by the KOKKOS package,
|
|
then data has to be moved between the host and device anyway, so it is
|
|
typically faster to let the host handle communication, by using the
|
|
<em>host</em> value. Using <em>host</em> instead of <em>no</em> will enable use of
|
|
multiple threads to pack/unpack communicated data.</p>
|
|
<hr class="docutils" />
|
|
<p>The <em>omp</em> style invokes settings associated with the use of the
|
|
USER-OMP package.</p>
|
|
<p>The <em>Nthreads</em> argument sets the number of OpenMP threads allocated for
|
|
each MPI task. For example, if your system has nodes with dual
|
|
quad-core processors, it has a total of 8 cores per node. You could
|
|
use two MPI tasks per node (e.g. using the -ppn option of the mpirun
|
|
command in MPICH or -npernode in OpenMPI), and set <em>Nthreads</em> = 4.
|
|
This would use all 8 cores on each node. Note that the product of MPI
|
|
tasks * threads/task should not exceed the physical number of cores
|
|
(on a node), otherwise performance will suffer.</p>
|
|
<p>Setting <em>Nthreads</em> = 0 instructs LAMMPS to use whatever value is the
|
|
default for the given OpenMP environment. This is usually determined
|
|
via the <em>OMP_NUM_THREADS</em> environment variable or the compiler
|
|
runtime. Note that in most cases the default for OpenMP capable
|
|
compilers is to use one thread for each available CPU core when
|
|
<em>OMP_NUM_THREADS</em> is not explicitly set, which can lead to poor
|
|
performance.</p>
|
|
<p>Here are examples of how to set the environment variable when
|
|
launching LAMMPS:</p>
|
|
<div class="highlight-python"><div class="highlight"><pre>env OMP_NUM_THREADS=4 lmp_machine -sf omp -in in.script
|
|
env OMP_NUM_THREADS=2 mpirun -np 2 lmp_machine -sf omp -in in.script
|
|
mpirun -x OMP_NUM_THREADS=2 -np 2 lmp_machine -sf omp -in in.script
|
|
</pre></div>
|
|
</div>
|
|
<p>or you can set it permanently in your shell’s start-up script.
|
|
All three of these examples use a total of 4 CPU cores.</p>
|
|
<p>Note that different MPI implementations have different ways of passing
|
|
the OMP_NUM_THREADS environment variable to all MPI processes. The
|
|
2nd example line above is for MPICH; the 3rd example line with -x is
|
|
for OpenMPI. Check your MPI documentation for additional details.</p>
|
|
<p>What combination of threads and MPI tasks gives the best performance
|
|
is difficult to predict and can depend on many components of your
|
|
input. Not all features of LAMMPS support OpenMP threading via the
|
|
USER-OMP package and the parallel efficiency can be very different,
|
|
too.</p>
|
|
<p>Optional keyword/value pairs can also be specified. Each has a
|
|
default value as listed below.</p>
|
|
<p>The <em>neigh</em> keyword specifies whether neighbor list building will be
|
|
multi-threaded in addition to force calculations. If <em>neigh</em> is set
|
|
to <em>no</em> then neighbor list calculation is performed only by MPI tasks
|
|
with no OpenMP threading. If <em>neigh</em> is <em>yes</em> (the default), a
|
|
multi-threaded neighbor list build is used. Using <em>neigh</em> = <em>yes</em> is
|
|
almost always faster and should produce identical neighbor lists at the
|
|
expense of using more memory. Specifically, neighbor list pages are
|
|
allocated for all threads at the same time and each thread works
|
|
within its own pages.</p>
|
|
</div>
|
|
<hr class="docutils" />
|
|
<div class="section" id="restrictions">
|
|
<h2>Restrictions<a class="headerlink" href="#restrictions" title="Permalink to this headline">¶</a></h2>
|
|
<p>This command cannot be used after the simulation box is defined by a
|
|
<a class="reference internal" href="read_data.html"><em>read_data</em></a> or <a class="reference internal" href="create_box.html"><em>create_box</em></a> command.</p>
|
|
<p>The cuda style of this command can only be invoked if LAMMPS was built
|
|
with the USER-CUDA package. See the <a class="reference internal" href="Section_start.html#start-3"><span>Making LAMMPS</span></a> section for more info.</p>
|
|
<p>The gpu style of this command can only be invoked if LAMMPS was built
|
|
with the GPU package. See the <a class="reference internal" href="Section_start.html#start-3"><span>Making LAMMPS</span></a> section for more info.</p>
|
|
<p>The intel style of this command can only be invoked if LAMMPS was
|
|
built with the USER-INTEL package. See the <a class="reference internal" href="Section_start.html#start-3"><span>Making LAMMPS</span></a> section for more info.</p>
|
|
<p>The kk style of this command can only be invoked if LAMMPS was built
|
|
with the KOKKOS package. See the <a class="reference internal" href="Section_start.html#start-3"><span>Making LAMMPS</span></a> section for more info.</p>
|
|
<p>The omp style of this command can only be invoked if LAMMPS was built
|
|
with the USER-OMP package. See the <a class="reference internal" href="Section_start.html#start-3"><span>Making LAMMPS</span></a> section for more info.</p>
|
|
</div>
|
|
<div class="section" id="related-commands">
|
|
<h2>Related commands<a class="headerlink" href="#related-commands" title="Permalink to this headline">¶</a></h2>
|
|
<p><a class="reference internal" href="suffix.html"><em>suffix</em></a>, “-pk” <a class="reference internal" href="Section_start.html#start-7"><span>command-line setting</span></a></p>
|
|
</div>
|
|
<div class="section" id="default">
|
|
<h2>Default<a class="headerlink" href="#default" title="Permalink to this headline">¶</a></h2>
|
|
<p>For the USER-CUDA package, the default is Ngpu = 1 and the option
|
|
defaults are newton = off, gpuID = 0 to Ngpu-1, timing = not enabled,
|
|
test = not enabled, and thread = auto. These settings are made
|
|
automatically by the required “-c on” <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>. You can change them by using the
|
|
package cuda command in your input script or via the “-pk cuda”
|
|
<a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.</p>
|
|
<p>For the GPU package, the default is Ngpu = 1 and the option defaults
|
|
are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0
|
|
to Ngpu-1, tpa = 1, and device = not used. These settings are made
|
|
automatically if the “-sf gpu” <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a> is used. If it is not used, you
|
|
must invoke the package gpu command in your input script or via the
|
|
“-pk gpu” <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.</p>
|
|
<p>For the USER-INTEL package, the default is Nphi = 1 and the option
|
|
defaults are omp = 0, mode = mixed, balance = -1, tpc = 4, tptask =
|
|
240. The default ghost option is determined by the pair style being
|
|
used. This value is output to the screen in the offload report at the
|
|
end of each run. Note that all of these settings, except “omp” and
|
|
“mode”, are ignored if LAMMPS was not built with Xeon Phi coprocessor
|
|
support. These settings are made automatically if the “-sf intel”
|
|
<a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a> is used. If it is
|
|
not used, you must invoke the package intel command in your input
|
|
script or via the “-pk intel” <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.</p>
|
|
<p>For the KOKKOS package, the option defaults are neigh = full, newton =
|
|
off, binsize = 0.0, and comm = device. These settings are made
|
|
automatically by the required “-k on” <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>. You can change them by using the
|
|
package kokkos command in your input script or via the “-pk kokkos”
|
|
<a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.</p>
|
|
<p>For the USER-OMP package, the default is Nthreads = 0 and the option
|
|
defaults are neigh = yes. These settings are made automatically if
|
|
the “-sf omp” <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a> is
|
|
used. If it is not used, you must invoke the package omp command in
|
|
your input script or via the “-pk omp” <a class="reference internal" href="Section_start.html#start-7"><span>command-line switch</span></a>.</p>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
</div>
|
|
</div>
|
|
<footer>
|
|
|
|
|
|
<hr/>
|
|
|
|
<div role="contentinfo">
|
|
<p>
|
|
© Copyright 2013 Sandia Corporation.
|
|
</p>
|
|
</div>
|
|
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
|
|
|
</footer>
|
|
|
|
</div>
|
|
</div>
|
|
|
|
</section>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
|
|
<script type="text/javascript">
|
|
var DOCUMENTATION_OPTIONS = {
|
|
URL_ROOT:'./',
|
|
VERSION:'',
|
|
COLLAPSE_INDEX:false,
|
|
FILE_SUFFIX:'.html',
|
|
HAS_SOURCE: true
|
|
};
|
|
</script>
|
|
<script type="text/javascript" src="_static/jquery.js"></script>
|
|
<script type="text/javascript" src="_static/underscore.js"></script>
|
|
<script type="text/javascript" src="_static/doctools.js"></script>
|
|
<script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
|
<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2/js/jquery-1.11.0.min.js"></script>
|
|
<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2/js/lightbox.min.js"></script>
|
|
<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2-customize/jquery-noconflict.js"></script>
|
|
|
|
|
|
|
|
|
|
|
|
<script type="text/javascript" src="_static/js/theme.js"></script>
|
|
|
|
|
|
|
|
|
|
<script type="text/javascript">
|
|
jQuery(function () {
|
|
SphinxRtdTheme.StickyNav.enable();
|
|
});
|
|
</script>
|
|
|
|
|
|
</body>
|
|
</html> |