forked from lijiext/lammps
714 lines
43 KiB
HTML
714 lines
43 KiB
HTML
|
|
|
|
<!DOCTYPE html>
|
|
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
|
|
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
|
|
<head>
|
|
<meta charset="utf-8">
|
|
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
|
|
<title>package command — LAMMPS documentation</title>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
|
|
|
|
|
|
|
|
<link rel="stylesheet" href="_static/sphinxcontrib-images/LightBox2/lightbox2/css/lightbox.css" type="text/css" />
|
|
|
|
|
|
|
|
<link rel="top" title="LAMMPS documentation" href="index.html"/>
|
|
|
|
|
|
<script src="_static/js/modernizr.min.js"></script>
|
|
|
|
</head>
|
|
|
|
<body class="wy-body-for-nav" role="document">
|
|
|
|
<div class="wy-grid-for-nav">
|
|
|
|
|
|
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
|
<div class="wy-side-nav-search">
|
|
|
|
|
|
|
|
<a href="Manual.html" class="icon icon-home"> LAMMPS
|
|
|
|
|
|
|
|
</a>
|
|
|
|
|
|
<div role="search">
|
|
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
|
|
<input type="text" name="q" placeholder="Search docs" />
|
|
<input type="hidden" name="check_keywords" value="yes" />
|
|
<input type="hidden" name="area" value="default" />
|
|
</form>
|
|
</div>
|
|
|
|
|
|
</div>
|
|
|
|
<div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
|
|
|
|
|
|
|
|
<ul>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_intro.html">1. Introduction</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_start.html">2. Getting Started</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_commands.html">3. Commands</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_packages.html">4. Packages</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_accelerate.html">5. Accelerating LAMMPS performance</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_howto.html">6. How-to discussions</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_example.html">7. Example problems</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_perf.html">8. Performance &amp; scalability</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_tools.html">9. Additional tools</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_modify.html">10. Modifying &amp; extending LAMMPS</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_python.html">11. Python interface to LAMMPS</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_errors.html">12. Errors</a></li>
|
|
<li class="toctree-l1"><a class="reference internal" href="Section_history.html">13. Future and history</a></li>
|
|
</ul>
|
|
|
|
|
|
|
|
</div>
|
|
|
|
</nav>
|
|
|
|
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
|
|
|
|
|
|
<nav class="wy-nav-top" role="navigation" aria-label="top navigation">
|
|
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
|
<a href="Manual.html">LAMMPS</a>
|
|
</nav>
|
|
|
|
|
|
|
|
<div class="wy-nav-content">
|
|
<div class="rst-content">
|
|
<div role="navigation" aria-label="breadcrumbs navigation">
|
|
<ul class="wy-breadcrumbs">
|
|
<li><a href="Manual.html">Docs</a> »</li>
|
|
|
|
<li>package command</li>
|
|
<li class="wy-breadcrumbs-aside">
|
|
|
|
|
|
<a href="http://lammps.sandia.gov">Website</a>
|
|
<a href="Section_commands.html#comm">Commands</a>
|
|
|
|
</li>
|
|
</ul>
|
|
<hr/>
|
|
|
|
</div>
|
|
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
|
|
<div itemprop="articleBody">
|
|
|
|
<div class="section" id="package-command">
|
|
<span id="index-0"></span><h1>package command</h1>
|
|
<div class="section" id="syntax">
|
|
<h2>Syntax</h2>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">package</span> <span class="n">style</span> <span class="n">args</span>
|
|
</pre></div>
|
|
</div>
|
|
<ul class="simple">
|
|
<li>style = <em>gpu</em> or <em>intel</em> or <em>kokkos</em> or <em>omp</em></li>
|
|
<li>args = arguments specific to the style</li>
|
|
</ul>
|
|
<pre class="literal-block">
|
|
<em>gpu</em> args = Ngpu keyword value ...
|
|
Ngpu = # of GPUs per node
|
|
zero or more keyword/value pairs may be appended
|
|
keywords = <em>neigh</em> or <em>newton</em> or <em>binsize</em> or <em>split</em> or <em>gpuID</em> or <em>tpa</em> or <em>device</em> or <em>blocksize</em>
|
|
<em>neigh</em> value = <em>yes</em> or <em>no</em>
|
|
yes = neighbor list build on GPU (default)
|
|
no = neighbor list build on CPU
|
|
<em>newton</em> = <em>off</em> or <em>on</em>
|
|
off = set Newton pairwise flag off (default and required)
|
|
on = set Newton pairwise flag on (currently not allowed)
|
|
<em>binsize</em> value = size
|
|
size = bin size for neighbor list construction (distance units)
|
|
<em>split</em> = fraction
|
|
fraction = fraction of atoms assigned to GPU (default = 1.0)
|
|
<em>gpuID</em> values = first last
|
|
first = ID of first GPU to be used on each node
|
|
last = ID of last GPU to be used on each node
|
|
<em>tpa</em> value = Nthreads
|
|
Nthreads = # of GPU threads used per atom
|
|
<em>device</em> value = device_type
|
|
device_type = <em>kepler</em> or <em>fermi</em> or <em>cypress</em> or <em>generic</em>
|
|
<em>blocksize</em> value = size
|
|
size = thread block size for pair force computation
|
|
<em>intel</em> args = Nphi keyword value ...
|
|
Nphi = # of coprocessors per node
|
|
zero or more keyword/value pairs may be appended
|
|
keywords = <em>mode</em> or <em>omp</em> or <em>lrt</em> or <em>balance</em> or <em>ghost</em> or <em>tpc</em> or <em>tptask</em> or <em>no_affinity</em>
|
|
<em>mode</em> value = <em>single</em> or <em>mixed</em> or <em>double</em>
|
|
single = perform force calculations in single precision
|
|
mixed = perform force calculations in mixed precision
|
|
double = perform force calculations in double precision
|
|
<em>omp</em> value = Nthreads
|
|
Nthreads = number of OpenMP threads to use on CPU (default = 0)
|
|
<em>lrt</em> value = <em>yes</em> or <em>no</em>
|
|
yes = use additional thread dedicated for some PPPM calculations
|
|
no = do not dedicate an extra thread for some PPPM calculations
|
|
<em>balance</em> value = split
|
|
split = fraction of work to offload to coprocessor, -1 for dynamic
|
|
<em>ghost</em> value = <em>yes</em> or <em>no</em>
|
|
yes = include ghost atoms for offload
|
|
no = do not include ghost atoms for offload
|
|
<em>tpc</em> value = Ntpc
|
|
Ntpc = max number of coprocessor threads per coprocessor core (default = 4)
|
|
<em>tptask</em> value = Ntptask
|
|
Ntptask = max number of coprocessor threads per MPI task (default = 240)
|
|
<em>no_affinity</em> values = none
|
|
<em>kokkos</em> args = keyword value ...
|
|
zero or more keyword/value pairs may be appended
|
|
keywords = <em>neigh</em> or <em>newton</em> or <em>binsize</em> or <em>comm</em> or <em>comm/exchange</em> or <em>comm/forward</em>
|
|
<em>neigh</em> value = <em>full</em> or <em>half</em> or <em>n2</em> or <em>full/cluster</em>
|
|
full = full neighbor list
|
|
half = half neighbor list built in thread-safe manner
|
|
n2 = non-binning neighbor list build, O(N^2) algorithm
|
|
full/cluster = full neighbor list with clustered groups of atoms
|
|
<em>newton</em> = <em>off</em> or <em>on</em>
|
|
off = set Newton pairwise and bonded flags off (default)
|
|
on = set Newton pairwise and bonded flags on
|
|
<em>binsize</em> value = size
|
|
size = bin size for neighbor list construction (distance units)
|
|
<em>comm</em> value = <em>no</em> or <em>host</em> or <em>device</em>
|
|
use value for both comm/exchange and comm/forward
|
|
<em>comm/exchange</em> value = <em>no</em> or <em>host</em> or <em>device</em>
|
|
<em>comm/forward</em> value = <em>no</em> or <em>host</em> or <em>device</em>
|
|
no = perform communication pack/unpack in non-KOKKOS mode
|
|
host = perform pack/unpack on host (e.g. with OpenMP threading)
|
|
device = perform pack/unpack on device (e.g. on GPU)
|
|
<em>omp</em> args = Nthreads keyword value ...
|
|
Nthreads = # of OpenMP threads to associate with each MPI process
|
|
zero or more keyword/value pairs may be appended
|
|
keywords = <em>neigh</em>
|
|
<em>neigh</em> value = <em>yes</em> or <em>no</em>
|
|
yes = threaded neighbor list build (default)
|
|
no = non-threaded neighbor list build
|
|
</pre>
|
|
</div>
|
|
<div class="section" id="examples">
|
|
<h2>Examples</h2>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">package</span> <span class="n">gpu</span> <span class="mi">1</span>
|
|
<span class="n">package</span> <span class="n">gpu</span> <span class="mi">1</span> <span class="n">split</span> <span class="mf">0.75</span>
|
|
<span class="n">package</span> <span class="n">gpu</span> <span class="mi">2</span> <span class="n">split</span> <span class="o">-</span><span class="mf">1.0</span>
|
|
<span class="n">package</span> <span class="n">kokkos</span> <span class="n">neigh</span> <span class="n">half</span> <span class="n">comm</span> <span class="n">device</span>
|
|
<span class="n">package</span> <span class="n">omp</span> <span class="mi">0</span> <span class="n">neigh</span> <span class="n">no</span>
|
|
<span class="n">package</span> <span class="n">omp</span> <span class="mi">4</span>
|
|
<span class="n">package</span> <span class="n">intel</span> <span class="mi">1</span>
|
|
<span class="n">package</span> <span class="n">intel</span> <span class="mi">2</span> <span class="n">omp</span> <span class="mi">4</span> <span class="n">mode</span> <span class="n">mixed</span> <span class="n">balance</span> <span class="mf">0.5</span>
|
|
</pre></div>
|
|
</div>
|
|
</div>
|
|
<div class="section" id="description">
|
|
<h2>Description</h2>
|
|
<p>This command invokes package-specific settings for the various
|
|
accelerator packages available in LAMMPS. Currently the following
|
|
packages use settings from this command: GPU, USER-INTEL, KOKKOS, and
|
|
USER-OMP.</p>
|
|
<p>If this command is specified in an input script, it must be near the
|
|
top of the script, before the simulation box has been defined. This
|
|
is because it specifies settings that the accelerator packages use in
|
|
their initialization, before a simulation is defined.</p>
|
|
<p>This command can also be specified from the command-line when
|
|
launching LAMMPS, using the “-pk” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>. The syntax is exactly the same as
|
|
when used in an input script.</p>
|
|
<p>Note that all of the accelerator packages require the package command
|
|
to be specified (except the OPT package), if the package is to be used
|
|
in a simulation (LAMMPS can be built with an accelerator package
|
|
without using it in a particular simulation). However, in all cases,
|
|
a default version of the command is typically invoked by other
|
|
accelerator settings.</p>
|
|
<p>The KOKKOS package requires a “-k on” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>, which invokes a
|
|
“package kokkos” command with default settings.</p>
|
|
<p>For the GPU, USER-INTEL, and USER-OMP packages, if a “-sf gpu” or “-sf
|
|
intel” or “-sf omp” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>
|
|
is used to auto-append accelerator suffixes to various styles in the
|
|
input script, then those switches also invoke a “package gpu”,
|
|
“package intel”, or “package omp” command with default settings.</p>
|
|
<div class="admonition note">
|
|
<p class="first admonition-title">Note</p>
|
|
<p class="last">A package command for a particular style can be invoked multiple
|
|
times when a simulation is set up, e.g. by the “-c on”, “-k on”, “-sf”,
|
|
and “-pk” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switches</span></a>, and by
|
|
using this command in an input script. Each time it is used all of
|
|
the style options are set, either to default values or to specified
|
|
settings. I.e. settings from previous invocations do not persist
|
|
across multiple invocations.</p>
|
|
</div>
|
|
<p>See the <a class="reference internal" href="Section_accelerate.html"><span class="doc">Section Accelerate</span></a> section of the
|
|
manual for more details about using the various accelerator packages
|
|
for speeding up LAMMPS simulations.</p>
|
|
<hr class="docutils" />
|
|
<p>The <em>gpu</em> style invokes settings associated with the use of the GPU
|
|
package.</p>
|
|
<p>The <em>Ngpu</em> argument sets the number of GPUs per node. There must be
|
|
at least as many MPI tasks per node as GPUs, as set by the mpirun or
|
|
mpiexec command. If there are more MPI tasks (per node)
|
|
than GPUs, multiple MPI tasks will share each GPU.</p>
|
|
<p>Optional keyword/value pairs can also be specified. Each has a
|
|
default value as listed below.</p>
|
|
<p>The <em>neigh</em> keyword specifies where neighbor lists for pair style
|
|
computation will be built. If <em>neigh</em> is <em>yes</em>, which is the default,
|
|
neighbor list building is performed on the GPU. If <em>neigh</em> is <em>no</em>,
|
|
neighbor list building is performed on the CPU. GPU neighbor list
|
|
building currently cannot be used with a triclinic box. GPU neighbor
|
|
list calculation currently cannot be used with
|
|
<a class="reference internal" href="pair_hybrid.html"><span class="doc">hybrid</span></a> pair styles. GPU neighbor lists are not
|
|
compatible with commands that are not GPU-enabled. When a non-GPU
|
|
enabled command requires a neighbor list, it will also be built on the
|
|
CPU. In these cases, it will typically be more efficient to only use
|
|
CPU neighbor list builds.</p>
|
|
<p>The <em>newton</em> keyword sets the Newton flags for pairwise (not bonded)
|
|
interactions to <em>off</em> or <em>on</em>, the same as the <a class="reference internal" href="newton.html"><span class="doc">newton</span></a>
|
|
command allows. Currently, only an <em>off</em> value is allowed, since all
|
|
the GPU package pair styles require this setting. This means more
|
|
computation is done, but less communication. In the future a value of
|
|
<em>on</em> may be allowed, so the <em>newton</em> keyword is included as an option
|
|
for compatibility with the package command for other accelerator
|
|
styles. Note that the newton setting for bonded interactions is not
|
|
affected by this keyword.</p>
|
|
<p>The <em>binsize</em> keyword sets the size of bins used to bin atoms in
|
|
neighbor list builds performed on the GPU, if <em>neigh</em> = <em>yes</em> is set.
|
|
If <em>binsize</em> is set to 0.0 (the default), then bins = the size of the
|
|
pairwise cutoff + neighbor skin distance. This is 2x larger than the
|
|
LAMMPS default used for neighbor list building on the CPU. This will
|
|
be close to optimal for the GPU, so you do not normally need to use
|
|
this keyword. Note that if you use a longer-than-usual pairwise
|
|
cutoff, e.g. to allow for a smaller fraction of KSpace work with a
|
|
<a class="reference internal" href="kspace_style.html"><span class="doc">long-range Coulombic solver</span></a> because the GPU is
|
|
faster at performing pairwise interactions, then it may be optimal to
|
|
make the <em>binsize</em> smaller than the default. For example, with a
|
|
cutoff of 20*sigma in LJ <a class="reference internal" href="units.html"><span class="doc">units</span></a> and a neighbor skin
|
|
distance of sigma, a <em>binsize</em> = 5.25*sigma can be more efficient than
|
|
the default.</p>
|
|
<p>The <em>split</em> keyword can be used for load balancing force calculations
|
|
between CPU and GPU cores in GPU-enabled pair styles. If 0 &lt; <em>split</em> &lt;
|
|
1.0, a fixed fraction of particles is offloaded to the GPU while force
|
|
calculation for the other particles occurs simultaneously on the CPU.
|
|
If <em>split</em> &lt; 0.0, the optimal fraction (based on CPU and GPU timings)
|
|
is calculated every 25 timesteps, i.e. dynamic load-balancing across
|
|
the CPU and GPU is performed. If <em>split</em> = 1.0, all force
|
|
calculations for GPU accelerated pair styles are performed on the GPU.
|
|
In this case, other <a class="reference internal" href="pair_hybrid.html"><span class="doc">hybrid</span></a> pair interactions,
|
|
<a class="reference internal" href="bond_style.html"><span class="doc">bond</span></a>, <a class="reference internal" href="angle_style.html"><span class="doc">angle</span></a>,
|
|
<a class="reference internal" href="dihedral_style.html"><span class="doc">dihedral</span></a>, <a class="reference internal" href="improper_style.html"><span class="doc">improper</span></a>, and
|
|
<a class="reference internal" href="kspace_style.html"><span class="doc">long-range</span></a> calculations can be performed on the
|
|
CPU while the GPU is performing force calculations for the GPU-enabled
|
|
pair style. If all CPU force computations complete before the GPU
|
|
completes, LAMMPS will block until the GPU has finished before
|
|
continuing the timestep.</p>
|
|
<p>As an example, if you have two GPUs per node and 8 CPU cores per node,
|
|
and would like to run on 4 nodes (32 cores) with dynamic balancing of
|
|
force calculation across CPU and GPU cores, you could specify</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">mpirun</span> <span class="o">-</span><span class="n">np</span> <span class="mi">32</span> <span class="o">-</span><span class="n">sf</span> <span class="n">gpu</span> <span class="o">-</span><span class="ow">in</span> <span class="ow">in</span><span class="o">.</span><span class="n">script</span> <span class="c1"># launch command</span>
|
|
<span class="n">package</span> <span class="n">gpu</span> <span class="mi">2</span> <span class="n">split</span> <span class="o">-</span><span class="mi">1</span> <span class="c1"># input script command</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>In this case, all CPU cores and GPU devices on the nodes would be
|
|
utilized. Each GPU device would be shared by 4 CPU cores. The CPU
|
|
cores would perform force calculations for some fraction of the
|
|
particles at the same time the GPUs performed force calculation for
|
|
the other particles.</p>
|
|
<p>The <em>gpuID</em> keyword allows selection of which GPUs on each node will
|
|
be used for a simulation. The <em>first</em> and <em>last</em> values specify the
|
|
GPU IDs to use (from 0 to Ngpu-1). By default, first = 0 and last =
|
|
Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number
|
|
of physical GPUs. If you only wish to use a subset, set Ngpu to a
|
|
smaller number and first/last to a sub-range of the available GPUs.</p>
|
|
<p>The <em>tpa</em> keyword sets the number of GPU threads per atom used to
|
|
perform force calculations. With a default value of 1, the number of
|
|
threads will be chosen based on the pair style, however, the value can
|
|
be set explicitly with this keyword to fine-tune performance. For
|
|
large cutoffs or with a small number of particles per GPU, increasing
|
|
the value can improve performance. The number of threads per atom must
|
|
be a power of 2 and currently cannot be greater than 32.</p>
|
|
<p>The <em>device</em> keyword can be used to tune parameters optimized for a
|
|
specific accelerator, when using OpenCL. For CUDA, the <em>device</em>
|
|
keyword is ignored. Currently, the device type is limited to NVIDIA
|
|
Kepler, NVIDIA Fermi, AMD Cypress, or a generic device. More devices
|
|
may be added later. The default device type can be specified when
|
|
building LAMMPS with the GPU library, via settings in the
|
|
lib/gpu/Makefile that is used.</p>
|
|
<p>The <em>blocksize</em> keyword allows you to tweak the number of threads used
|
|
per thread block. This number should be a multiple of 32 (for GPUs)
|
|
and its maximum depends on the specific GPU hardware. Typical choices
|
|
are 64, 128, or 256. A larger blocksize increases occupancy of
|
|
individual GPU cores, but reduces the total number of thread blocks,
|
|
thus may lead to load imbalance.</p>
|
|
<hr class="docutils" />
|
|
<p>The <em>intel</em> style invokes settings associated with the use of the
|
|
USER-INTEL package. All of its settings, except the <em>omp</em> and <em>mode</em>
|
|
keywords, are ignored if LAMMPS was not built with Xeon Phi
|
|
coprocessor support. All of its settings, including the <em>omp</em> and
|
|
<em>mode</em> keywords, are applicable if LAMMPS was built with coprocessor
|
|
support.</p>
|
|
<p>The <em>Nphi</em> argument sets the number of coprocessors per node.
|
|
This can be set to any value, including 0, if LAMMPS was not
|
|
built with coprocessor support.</p>
|
|
<p>Optional keyword/value pairs can also be specified. Each has a
|
|
default value as listed below.</p>
|
|
<p>The <em>omp</em> keyword determines the number of OpenMP threads allocated
|
|
for each MPI task when any portion of the interactions computed by a
|
|
USER-INTEL pair style are run on the CPU. This can be the case even
|
|
if LAMMPS was built with coprocessor support; see the <em>balance</em>
|
|
keyword discussion below. If you are running with fewer MPI tasks/node
|
|
than there are CPUs, it can be advantageous to use OpenMP threading on
|
|
the CPUs.</p>
|
|
<div class="admonition note">
|
|
<p class="first admonition-title">Note</p>
|
|
<p class="last">The <em>omp</em> keyword has nothing to do with coprocessor threads on
|
|
the Xeon Phi; see the <em>tpc</em> and <em>tptask</em> keywords below for a
|
|
discussion of coprocessor threads.</p>
|
|
</div>
|
|
<p>The <em>Nthreads</em> value for the <em>omp</em> keyword sets the number of OpenMP
|
|
threads allocated for each MPI task. Setting <em>Nthreads</em> = 0 (the
|
|
default) instructs LAMMPS to use whatever value is the default for the
|
|
given OpenMP environment. This is usually determined via the
|
|
<em>OMP_NUM_THREADS</em> environment variable or the compiler runtime, which
|
|
is usually a value of 1.</p>
|
|
<p>For more details, including examples of how to set the OMP_NUM_THREADS
|
|
environment variable, see the discussion of the <em>Nthreads</em> setting on
|
|
this doc page for the “package omp” command. Nthreads is a required
|
|
argument for the USER-OMP package. Its meaning is exactly the same
|
|
for the USER-INTEL package.</p>
|
|
<div class="admonition note">
|
|
<p class="first admonition-title">Note</p>
|
|
<p class="last">If you build LAMMPS with both the USER-INTEL and USER-OMP
|
|
packages, be aware that both packages allow setting of the <em>Nthreads</em>
|
|
value via their package commands, but there is only a single global
|
|
<em>Nthreads</em> value used by OpenMP. Thus if both package commands are
|
|
invoked, you should ensure the two values are consistent. If they are
|
|
not, the last one invoked will take precedence, for both packages.
|
|
Also note that if the “-sf hybrid intel omp” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a> is used, it invokes a “package
|
|
intel” command, followed by a “package omp” command, both with a
|
|
setting of <em>Nthreads</em> = 0.</p>
|
|
</div>
|
|
<p>The <em>mode</em> keyword determines the precision mode to use for
|
|
computing pair style forces, either on the CPU or on the coprocessor,
|
|
when using a USER-INTEL supported <a class="reference internal" href="pair_style.html"><span class="doc">pair style</span></a>. It
|
|
can take a value of <em>single</em>, <em>mixed</em> which is the default, or
|
|
<em>double</em>. <em>Single</em> means single precision is used for the entire
|
|
force calculation. <em>Mixed</em> means forces between a pair of atoms are
|
|
computed in single precision, but accumulated and stored in double
|
|
precision, including storage of forces, torques, energies, and virial
|
|
quantities. <em>Double</em> means double precision is used for the entire
|
|
force calculation.</p>
|
|
<p>The <em>lrt</em> keyword can be used to enable “Long Range Thread (LRT)”
|
|
mode. It can take a value of <em>yes</em> to enable and <em>no</em> to disable.
|
|
LRT mode generates an extra thread (in addition to any OpenMP threads
|
|
specified with the OMP_NUM_THREADS environment variable or the <em>omp</em>
|
|
keyword). The extra thread is dedicated for performing part of the
|
|
<a class="reference internal" href="kspace_style.html"><span class="doc">PPPM solver</span></a> computations and communications. This
|
|
can improve parallel performance on processors supporting
|
|
Simultaneous Multithreading (SMT) such as Hyperthreading on Intel
|
|
processors. In this mode, one additional thread is generated per MPI
|
|
process. LAMMPS will generate a warning in the case that more threads
|
|
are used than available in SMT hardware on a node. If the PPPM solver
|
|
from the USER-INTEL package is not used, then the LRT setting is
|
|
ignored and no extra threads are generated. Enabling LRT will replace
|
|
the <a class="reference internal" href="run_style.html"><span class="doc">run_style</span></a> with the <em>verlet/lrt/intel</em> style that
|
|
is identical to the default <em>verlet</em> style aside from supporting the
|
|
LRT feature.</p>
|
|
<p>The <em>balance</em> keyword sets the fraction of <a class="reference internal" href="pair_style.html"><span class="doc">pair style</span></a> work offloaded to the coprocessor for split
|
|
values between 0.0 and 1.0 inclusive. While this fraction of work is
|
|
running on the coprocessor, other calculations will run on the host,
|
|
including neighbor and pair calculations that are not offloaded, as
|
|
well as angle, bond, dihedral, kspace, and some MPI communications.
|
|
If <em>split</em> is set to -1, the fraction of work is dynamically adjusted
|
|
automatically throughout the run. This typically gives performance
|
|
within 5 to 10 percent of the optimal fixed fraction.</p>
|
|
<p>The <em>ghost</em> keyword determines whether or not ghost atoms, i.e. atoms
|
|
at the boundaries of processor sub-domains, are offloaded for neighbor
|
|
and force calculations. When the value = “no”, ghost atoms are not
|
|
offloaded. This option can reduce the amount of data transfer with
|
|
the coprocessor and can also overlap MPI communication of forces with
|
|
computation on the coprocessor when the <a class="reference internal" href="newton.html"><span class="doc">newton pair</span></a>
|
|
setting is “on”. When the value = “yes”, ghost atoms are offloaded.
|
|
In some cases this can provide better performance, especially if the
|
|
<em>balance</em> fraction is high.</p>
|
|
<p>The <em>tpc</em> keyword sets the max # of coprocessor threads <em>Ntpc</em> that
|
|
will run on each core of the coprocessor. The default value = 4,
|
|
which is the number of hardware threads per core supported by the
|
|
current generation Xeon Phi chips.</p>
|
|
<p>The <em>tptask</em> keyword sets the max # of coprocessor threads (<em>Ntptask</em>)
|
|
assigned to each MPI task. The default value = 240, which is the
|
|
total # of threads an entire current generation Xeon Phi chip can run
|
|
(240 = 60 cores * 4 threads/core). This means each MPI task assigned
|
|
to the Phi will have enough threads for the chip to run the max allowed,
|
|
even if only 1 MPI task is assigned. If 8 MPI tasks are assigned to
|
|
the Phi, each will run with 30 threads. If you wish to limit the
|
|
number of threads per MPI task, set <em>tptask</em> to a smaller value.
|
|
E.g. for <em>tptask</em> = 16, if 8 MPI tasks are assigned, each will run
|
|
with 16 threads, for a total of 128.</p>
|
|
<p>Note that the default settings for <em>tpc</em> and <em>tptask</em> are fine for
|
|
most problems, regardless of how many MPI tasks you assign to a Phi.</p>
|
|
<p>The <em>no_affinity</em> keyword will turn off automatic setting of core
|
|
affinity for MPI tasks and OpenMP threads on the host when using
|
|
offload to a coprocessor. Affinity settings are used when possible
|
|
to prevent MPI tasks and OpenMP threads from being on separate NUMA
|
|
domains and to prevent offload threads from interfering with other
|
|
processes/threads used for LAMMPS.</p>
|
|
<hr class="docutils" />
|
|
<p>The <em>kokkos</em> style invokes settings associated with the use of the
|
|
KOKKOS package.</p>
|
|
<p>All of the settings are optional keyword/value pairs. Each has a
|
|
default value as listed below.</p>
|
|
<p>The <em>neigh</em> keyword determines how neighbor lists are built. A value
|
|
of <em>half</em> uses a thread-safe variant of half-neighbor lists,
|
|
the same as used by most pair styles in LAMMPS. A value of
|
|
<em>n2</em> uses an O(N^2) algorithm to build the neighbor list without
|
|
binning, where N = # of atoms on a processor. It is typically slower
|
|
than the other methods, which use binning.</p>
|
|
<p>A value of <em>full</em> uses a full neighbor list and is the default. This
|
|
performs twice as much computation as the <em>half</em> option, however that
|
|
is often a win because it is thread-safe and doesn’t require atomic
|
|
operations in the calculation of pair forces. For that reason, <em>full</em>
|
|
is the default setting. However, when running in MPI-only mode with 1
|
|
thread per MPI task, <em>half</em> neighbor lists will typically be faster,
|
|
just as it is for non-accelerated pair styles.</p>
|
|
<p>A value of <em>full/cluster</em> is an experimental neighbor style, where
|
|
particles interact with all particles within a small cluster, if at
|
|
least one of the cluster’s particles is within the neighbor cutoff
|
|
range. This potentially allows for better vectorization on
|
|
architectures such as the Intel Phi. It also reduces the size of the
|
|
neighbor list by roughly a factor of the cluster size, thus reducing
|
|
the total memory footprint considerably.</p>
|
|
<p>The <em>newton</em> keyword sets the Newton flags for pairwise and bonded
|
|
interactions to <em>off</em> or <em>on</em>, the same as the <a class="reference internal" href="newton.html"><span class="doc">newton</span></a>
|
|
command allows. The default is <em>off</em> because this will almost always
|
|
give better performance for the KOKKOS package. This means more
|
|
computation is done, but less communication. However, when running in
|
|
MPI-only mode with 1 thread per MPI task, a value of <em>on</em> will
|
|
typically be faster, just as it is for non-accelerated pair styles.</p>
|
|
<p>The <em>binsize</em> keyword sets the size of bins used to bin atoms in
|
|
neighbor list builds. The same value can be set by the <a class="reference internal" href="neigh_modify.html"><span class="doc">neigh_modify binsize</span></a> command. Making it an option in the
|
|
package kokkos command allows it to be set from the command line. The
|
|
default value is 0.0, which means the LAMMPS default will be used,
|
|
which is bins = 1/2 the size of the pairwise cutoff + neighbor skin
|
|
distance. This is fine when neighbor lists are built on the CPU. For
|
|
GPU builds, a 2x larger binsize equal to the pairwise cutoff +
|
|
neighbor skin, is often faster, which can be set by this keyword.
|
|
Note that if you use a longer-than-usual pairwise cutoff, e.g. to
|
|
allow for a smaller fraction of KSpace work with a <a class="reference internal" href="kspace_style.html"><span class="doc">long-range Coulombic solver</span></a> because the GPU is faster at
|
|
performing pairwise interactions, then this rule of thumb may give too
|
|
large a binsize.</p>
|
|
<p>The <em>comm</em> and <em>comm/exchange</em> and <em>comm/forward</em> keywords determine
|
|
whether the host or device performs the packing and unpacking of data
|
|
when communicating per-atom data between processors. “Exchange”
|
|
communication happens only on timesteps that neighbor lists are
|
|
rebuilt. The data is only for atoms that migrate to new processors.
|
|
“Forward” communication happens every timestep. The data is for atom
|
|
coordinates and any other atom properties that need to be updated for
|
|
ghost atoms owned by each processor.</p>
|
|
<p>The <em>comm</em> keyword is simply a short-cut to set the same value
|
|
for both the <em>comm/exchange</em> and <em>comm/forward</em> keywords.</p>
|
|
<p>The value options for all 3 keywords are <em>no</em> or <em>host</em> or <em>device</em>.
|
|
A value of <em>no</em> means to use the standard non-KOKKOS method of
|
|
packing/unpacking data for the communication. A value of <em>host</em> means
|
|
to use the host, typically a multi-core CPU, and perform the
|
|
packing/unpacking in parallel with threads. A value of <em>device</em> means
|
|
to use the device, typically a GPU, to perform the packing/unpacking
|
|
operation.</p>
|
|
<p>The optimal choice for these keywords depends on the input script and
|
|
the hardware used. The <em>no</em> value is useful for verifying that the
|
|
Kokkos-based <em>host</em> and <em>device</em> values are working correctly. It may
|
|
also be the fastest choice when using Kokkos styles in MPI-only mode
|
|
(i.e. with a thread count of 1).</p>
|
|
<p>When running on CPUs or Xeon Phi, the <em>host</em> and <em>device</em> values work
|
|
identically. When using GPUs, the <em>device</em> value will typically be
|
|
optimal if all of your styles used in your input script are supported
|
|
by the KOKKOS package. In this case data can stay on the GPU for many
|
|
timesteps without being moved between the host and GPU, if you use the
|
|
<em>device</em> value. This requires that your MPI is able to access GPU
|
|
memory directly. Currently that is true for OpenMPI 1.8 (or later
|
|
versions), Mvapich2 1.9 (or later), and CrayMPI. If your script uses
|
|
styles (e.g. fixes) which are not yet supported by the KOKKOS package,
|
|
then data has to be moved between the host and device anyway, so it is
|
|
typically faster to let the host handle communication, by using the
|
|
<em>host</em> value. Using <em>host</em> instead of <em>no</em> will enable use of
|
|
multiple threads to pack/unpack communicated data.</p>
|
|
<hr class="docutils" />
|
|
<p>The <em>omp</em> style invokes settings associated with the use of the
|
|
USER-OMP package.</p>
|
|
<p>The <em>Nthreads</em> argument sets the number of OpenMP threads allocated for
|
|
each MPI task. For example, if your system has nodes with dual
|
|
quad-core processors, it has a total of 8 cores per node. You could
|
|
use two MPI tasks per node (e.g. using the -ppn option of the mpirun
|
|
command in MPICH or -npernode in OpenMPI), and set <em>Nthreads</em> = 4.
|
|
This would use all 8 cores on each node. Note that the product of MPI
|
|
tasks * threads/task should not exceed the physical number of cores
|
|
(on a node), otherwise performance will suffer.</p>
|
|
<p>Setting <em>Nthreads</em> = 0 instructs LAMMPS to use whatever value is the
|
|
default for the given OpenMP environment. This is usually determined
|
|
via the <em>OMP_NUM_THREADS</em> environment variable or the compiler
|
|
runtime. Note that in most cases the default for OpenMP capable
|
|
compilers is to use one thread for each available CPU core when
|
|
<em>OMP_NUM_THREADS</em> is not explicitly set, which can lead to poor
|
|
performance.</p>
|
|
<p>Here are examples of how to set the environment variable when
|
|
launching LAMMPS:</p>
|
|
<div class="highlight-default"><div class="highlight"><pre><span></span><span class="n">env</span> <span class="n">OMP_NUM_THREADS</span><span class="o">=</span><span class="mi">4</span> <span class="n">lmp_machine</span> <span class="o">-</span><span class="n">sf</span> <span class="n">omp</span> <span class="o">-</span><span class="ow">in</span> <span class="ow">in</span><span class="o">.</span><span class="n">script</span>
|
|
<span class="n">env</span> <span class="n">OMP_NUM_THREADS</span><span class="o">=</span><span class="mi">2</span> <span class="n">mpirun</span> <span class="o">-</span><span class="n">np</span> <span class="mi">2</span> <span class="n">lmp_machine</span> <span class="o">-</span><span class="n">sf</span> <span class="n">omp</span> <span class="o">-</span><span class="ow">in</span> <span class="ow">in</span><span class="o">.</span><span class="n">script</span>
|
|
<span class="n">mpirun</span> <span class="o">-</span><span class="n">x</span> <span class="n">OMP_NUM_THREADS</span><span class="o">=</span><span class="mi">2</span> <span class="o">-</span><span class="n">np</span> <span class="mi">2</span> <span class="n">lmp_machine</span> <span class="o">-</span><span class="n">sf</span> <span class="n">omp</span> <span class="o">-</span><span class="ow">in</span> <span class="ow">in</span><span class="o">.</span><span class="n">script</span>
|
|
</pre></div>
|
|
</div>
|
|
<p>or you can set it permanently in your shell’s start-up script.
|
|
All three of these examples use a total of 4 CPU cores.</p>
|
|
<p>Note that different MPI implementations have different ways of passing
|
|
the OMP_NUM_THREADS environment variable to all MPI processes. The
|
|
2nd example line above is for MPICH; the 3rd example line with -x is
|
|
for OpenMPI. Check your MPI documentation for additional details.</p>
|
|
<p>What combination of threads and MPI tasks gives the best performance
|
|
is difficult to predict and can depend on many components of your
|
|
input. Not all features of LAMMPS support OpenMP threading via the
|
|
USER-OMP package and the parallel efficiency can be very different,
|
|
too.</p>
|
|
<p>Optional keyword/value pairs can also be specified. Each has a
|
|
default value as listed below.</p>
|
|
<p>The <em>neigh</em> keyword specifies whether neighbor list building will be
|
|
multi-threaded in addition to force calculations. If <em>neigh</em> is set
|
|
to <em>no</em> then neighbor list calculation is performed only by MPI tasks
|
|
with no OpenMP threading. If <em>neigh</em> is <em>yes</em> (the default), a
|
|
multi-threaded neighbor list build is used. Using <em>neigh</em> = <em>yes</em> is
|
|
almost always faster and should produce identical neighbor lists at the
|
|
expense of using more memory. Specifically, neighbor list pages are
|
|
allocated for all threads at the same time and each thread works
|
|
within its own pages.</p>
|
|
</div>
|
|
<hr class="docutils" />
|
|
<div class="section" id="restrictions">
|
|
<h2>Restrictions</h2>
|
|
<p>This command cannot be used after the simulation box is defined by a
|
|
<a class="reference internal" href="read_data.html"><span class="doc">read_data</span></a> or <a class="reference internal" href="create_box.html"><span class="doc">create_box</span></a> command.</p>
|
|
<p>The gpu style of this command can only be invoked if LAMMPS was built
|
|
with the GPU package. See the <a class="reference internal" href="Section_start.html#start-3"><span class="std std-ref">Making LAMMPS</span></a> section for more info.</p>
|
|
<p>The intel style of this command can only be invoked if LAMMPS was
|
|
built with the USER-INTEL package. See the <a class="reference internal" href="Section_start.html#start-3"><span class="std std-ref">Making LAMMPS</span></a> section for more info.</p>
|
|
<p>The kk style of this command can only be invoked if LAMMPS was built
|
|
with the KOKKOS package. See the <a class="reference internal" href="Section_start.html#start-3"><span class="std std-ref">Making LAMMPS</span></a> section for more info.</p>
|
|
<p>The omp style of this command can only be invoked if LAMMPS was built
|
|
with the USER-OMP package. See the <a class="reference internal" href="Section_start.html#start-3"><span class="std std-ref">Making LAMMPS</span></a> section for more info.</p>
|
|
</div>
|
|
<div class="section" id="related-commands">
|
|
<h2>Related commands</h2>
|
|
<p><a class="reference internal" href="suffix.html"><span class="doc">suffix</span></a>, “-pk” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line setting</span></a></p>
|
|
</div>
|
|
<div class="section" id="default">
|
|
<h2>Default</h2>
|
|
<p>For the GPU package, the default is Ngpu = 1 and the option defaults
|
|
are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0
|
|
to Ngpu-1, tpa = 1, and device = not used. These settings are made
|
|
automatically if the “-sf gpu” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a> is used. If it is not used, you
|
|
must invoke the package gpu command in your input script or via the
|
|
“-pk gpu” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>.</p>
|
|
<p>For the USER-INTEL package, the default is Nphi = 1 and the option
|
|
defaults are omp = 0, mode = mixed, lrt = no, balance = -1, tpc = 4,
|
|
tptask = 240. The default ghost option is determined by the pair
|
|
style being used. This value is output to the screen in the offload
|
|
report at the end of each run. Note that all of these settings,
|
|
except “omp” and “mode”, are ignored if LAMMPS was not built with
|
|
Xeon Phi coprocessor support. These settings are made automatically
|
|
if the “-sf intel” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>
|
|
is used. If it is not used, you must invoke the package intel
|
|
command in your input script or via the “-pk intel” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>.</p>
|
|
<p>For the KOKKOS package, the option defaults are neigh = full, newton =
|
|
off, binsize = 0.0, and comm = device. These settings are made
|
|
automatically by the required “-k on” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>. You can change them by using the
|
|
package kokkos command in your input script or via the “-pk kokkos”
|
|
<a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>.</p>
|
|
<p>For the OMP package, the default is Nthreads = 0 and the option
|
|
defaults are neigh = yes. These settings are made automatically if
|
|
the “-sf omp” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a> is
|
|
used. If it is not used, you must invoke the package omp command in
|
|
your input script or via the “-pk omp” <a class="reference internal" href="Section_start.html#start-7"><span class="std std-ref">command-line switch</span></a>.</p>
|
|
</div>
|
|
</div>
|
|
|
|
|
|
</div>
|
|
</div>
|
|
<footer>
|
|
|
|
|
|
<hr/>
|
|
|
|
<div role="contentinfo">
|
|
<p>
|
|
© Copyright 2013 Sandia Corporation.
|
|
</p>
|
|
</div>
|
|
Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
|
|
|
|
</footer>
|
|
|
|
</div>
|
|
</div>
|
|
|
|
</section>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
|
|
<script type="text/javascript">
|
|
var DOCUMENTATION_OPTIONS = {
|
|
URL_ROOT:'./',
|
|
VERSION:'',
|
|
COLLAPSE_INDEX:false,
|
|
FILE_SUFFIX:'.html',
|
|
HAS_SOURCE: true
|
|
};
|
|
</script>
|
|
<script type="text/javascript" src="_static/jquery.js"></script>
|
|
<script type="text/javascript" src="_static/underscore.js"></script>
|
|
<script type="text/javascript" src="_static/doctools.js"></script>
|
|
<script type="text/javascript" src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
|
|
<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2/js/jquery-1.11.0.min.js"></script>
|
|
<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2/js/lightbox.min.js"></script>
|
|
<script type="text/javascript" src="_static/sphinxcontrib-images/LightBox2/lightbox2-customize/jquery-noconflict.js"></script>
|
|
|
|
|
|
|
|
|
|
|
|
<script type="text/javascript" src="_static/js/theme.js"></script>
|
|
|
|
|
|
|
|
|
|
<script type="text/javascript">
|
|
jQuery(function () {
|
|
SphinxRtdTheme.StickyNav.enable();
|
|
});
|
|
</script>
|
|
|
|
|
|
</body>
|
|
</html> |