<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>5. USER-OMP package &mdash; LAMMPS documentation</title>
</head>
<body>
<p><a href="Section_accelerate.html">Return to Section accelerate overview</a></p>

<h1>5. USER-OMP package</h1>
<p>The USER-OMP package was developed by Axel Kohlmeyer at Temple
University. It provides multi-threaded versions of most pair styles,
nearly all bonded styles (bond, angle, dihedral, improper), several
Kspace styles, and a few fix styles. The package currently uses the
OpenMP interface for multi-threading.</p>

<p>Here is a quick overview of how to use the USER-OMP package,
assuming one or more 16-core nodes. More details follow.</p>
<pre>
use -fopenmp with CCFLAGS and LINKFLAGS in Makefile.machine
make yes-user-omp
make mpi                               # build with USER-OMP package, if settings added to Makefile.mpi
make omp                               # or Makefile.omp already has settings
Make.py -v -p omp -o mpi -a file mpi   # or one-line build via Make.py
</pre>
<pre>
lmp_mpi -sf omp -pk omp 16 &lt; in.script                          # 1 MPI task, 16 threads
mpirun -np 4 lmp_mpi -sf omp -pk omp 4 -in in.script            # 4 MPI tasks, 4 threads/task
mpirun -np 32 -ppn 4 lmp_mpi -sf omp -pk omp 4 -in in.script    # 8 nodes, 4 MPI tasks/node, 4 threads/task
</pre>
<p><strong>Required hardware/software:</strong></p>

<p>Your compiler must support the OpenMP interface. You should have
one or more multi-core CPUs so that multiple threads can be launched
by each MPI task running on a CPU.</p>
<p><strong>Building LAMMPS with the USER-OMP package:</strong></p>

<p>The lines above illustrate how to include/build with the USER-OMP
package in two steps, using the "make" command, or with a single
command via the src/Make.py script, described in
<a href="Section_start.html#start-4">Section 2.4</a> of the manual.
Type "Make.py -h" for help.</p>

<p>Note that the CCFLAGS and LINKFLAGS settings in Makefile.machine
must include "-fopenmp". Likewise, if you use an Intel compiler, the
CCFLAGS setting must include "-restrict". The Make.py command will
add these automatically.</p>
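<p>For example, the relevant lines of a Makefile.machine for the GNU
compilers might look as follows (a sketch only; optimization flags
depend on your machine, and "-restrict" would be added only for
Intel compilers):</p>

<pre>
CCFLAGS =   -g -O3 -fopenmp
LINKFLAGS = -g -O3 -fopenmp
</pre>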
<p><strong>Run with the USER-OMP package from the command line:</strong></p>

<p>The mpirun or mpiexec command sets the total number of MPI tasks
used by LAMMPS (one or multiple per compute node) and the number of
MPI tasks used per node. E.g. the mpirun command in MPICH does this
via its -np and -ppn switches. Ditto for OpenMPI via -np and
-npernode.</p>
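<p>For instance, the last line of the overview above uses the
MPICH-style -ppn switch; a sketch of the OpenMPI equivalent would
be:</p>

<pre>
mpirun -np 32 -npernode 4 lmp_mpi -sf omp -pk omp 4 -in in.script   # 8 nodes, 4 MPI tasks/node, 4 threads/task
</pre>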
<p>You need to choose how many OpenMP threads per MPI task will be
used by the USER-OMP package. Note that the product of MPI tasks *
threads/task should not exceed the physical number of cores (on a
node); otherwise performance will suffer.</p>
<p>As in the lines above, use the "-sf omp"
<a href="Section_start.html#start-7">command-line switch</a>, which
will automatically append "omp" to styles that support it. The
"-sf omp" switch also issues a default
<a href="package.html">package omp 0</a> command, which will set the
number of threads per MPI task via the OMP_NUM_THREADS environment
variable.</p>
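<p>As an illustration, a run that relies on the default "package omp
0" behavior and the environment variable, rather than the "-pk"
switch (a sketch; how the variable is propagated to remote nodes
depends on your MPI launcher):</p>

<pre>
env OMP_NUM_THREADS=4 mpirun -np 4 lmp_mpi -sf omp -in in.script   # 4 MPI tasks, 4 threads/task
</pre>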
<p>You can also use the "-pk omp Nt"
<a href="Section_start.html#start-7">command-line switch</a> to
explicitly set Nt = # of OpenMP threads per MPI task to use, as well
as additional options. Its syntax is the same as the
<a href="package.html">package omp</a> command, whose doc page gives
details, including the default values used if it is not specified.
It also gives more details on how to set the number of threads via
the OMP_NUM_THREADS environment variable.</p>
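<p>As a sketch of the extended syntax, additional package options can
follow the thread count; this example assumes the "neigh" keyword of
the <a href="package.html">package omp</a> command, which controls
threaded neighbor-list builds:</p>

<pre>
mpirun -np 4 lmp_mpi -sf omp -pk omp 4 neigh yes -in in.script   # 4 tasks, 4 threads/task, threaded neighbor lists
</pre>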
<p><strong>Or run with the USER-OMP package by editing an input script:</strong></p>

<p>The discussion above for the mpirun/mpiexec command, MPI
tasks/node, and threads/MPI task is the same.</p>

<p>Use the <a href="suffix.html">suffix omp</a> command, or you can
explicitly add an "omp" suffix to individual styles in your input
script, e.g.</p>
<pre>
pair_style lj/cut/omp 2.5
</pre>
<p>You must also use the <a href="package.html">package omp</a>
command to enable the USER-OMP package. When you do this you also
specify how many threads per MPI task to use. The command doc page
explains other options and how to set the number of threads via the
OMP_NUM_THREADS environment variable.</p>
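<p>Put together, the top of an input script using 4 threads per MPI
task might begin as follows (a sketch; the package command must
appear before the styles it affects):</p>

<pre>
package omp 4
suffix omp
pair_style lj/cut 2.5   # becomes lj/cut/omp via the suffix command
</pre>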
<p><strong>Speed-ups to expect:</strong></p>

<p>Depending on which styles are accelerated, you should look for a
reduction in the "Pair time", "Bond time", "KSpace time", and "Loop
time" values printed at the end of a run.</p>
<p>You may see a small performance advantage (5 to 20%) when running
a USER-OMP style (in serial or parallel) with a single thread per MPI
task, versus running standard LAMMPS with its standard un-accelerated
styles (in serial or all-MPI parallelization with 1 task/core). This
is because many of the USER-OMP styles contain optimizations similar
to those used in the OPT package, described in
<a href="accelerate_opt.html">Section accelerate 5.3.6</a>.</p>
<p>With multiple threads/task, the optimal choice of number of MPI
tasks/node and OpenMP threads/task can vary a lot and should always
be tested via benchmark runs for a specific simulation running on a
specific machine, paying attention to guidelines discussed in the
next sub-section.</p>

<p>A description of the multi-threading strategy used in the USER-OMP
package and some performance examples are
<a href="http://sites.google.com/site/akohlmey/software/lammps-icms/lammps-icms-tms2011-talk.pdf?attredirects=0&amp;d=1">presented
here</a>.</p>
<p><strong>Guidelines for best performance:</strong></p>

<p>For many problems on current generation CPUs, running the USER-OMP
package with a single thread/task is faster than running with
multiple threads/task. This is because the MPI parallelization in
LAMMPS is often more efficient than multi-threading as implemented in
the USER-OMP package. The parallel efficiency (in a threaded sense)
also varies for different USER-OMP styles.</p>
<p>Using multiple threads/task can be more effective under the
following circumstances:</p>

<ul>
<li>Individual compute nodes have a significant number of CPU cores
but the CPU itself has limited memory bandwidth, e.g. for Intel Xeon
53xx (Clovertown) and 54xx (Harpertown) quad-core processors. Running
one MPI task per CPU core will result in significant performance
degradation, so that running with 4 or even only 2 MPI tasks per node
is faster. Running in hybrid MPI+OpenMP mode will reduce the
inter-node communication bandwidth contention in the same way, but
offers an additional speedup by utilizing the otherwise idle CPU
cores.</li>

<li>The interconnect used for MPI communication does not provide
sufficient bandwidth for a large number of MPI tasks per node. For
example, this applies to running over gigabit ethernet or on Cray XT4
or XT5 series supercomputers. As in the aforementioned case, this
effect worsens with an increasing number of nodes.</li>

<li>The system has a spatially inhomogeneous particle density which
does not map well to the <a href="processors.html">domain
decomposition scheme</a> or <a href="balance.html">load-balancing</a>
options that LAMMPS provides. This is because multi-threading
achieves parallelism over the number of particles, not via their
distribution in space.</li>

<li>A machine is being used in "capability mode", i.e. near the point
where MPI parallelism is maxed out. For example, this can happen when
using the <a href="kspace_style.html">PPPM solver</a> for long-range
electrostatics on large numbers of nodes. The scaling of the KSpace
calculation (see the <a href="kspace_style.html">kspace_style</a>
command) becomes the performance-limiting factor. Using
multi-threading allows fewer MPI tasks to be invoked and can speed up
the long-range solver, while increasing overall performance by
parallelizing the pairwise and bonded calculations via OpenMP.
Likewise, additional speedup can sometimes be achieved by increasing
the length of the Coulombic cutoff and thus reducing the work done by
the long-range solver. Using the
<a href="run_style.html">run_style verlet/split</a> command, which is
compatible with the USER-OMP package, is an alternative way to reduce
the number of MPI tasks assigned to the KSpace calculation (see the
sketch after this list).</li>
</ul>
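<p>As a sketch of that last alternative (the partition sizes are
illustrative; the -partition switch splits the MPI tasks between the
real-space and KSpace calculations, as described on the run_style doc
page):</p>

<pre>
# in the input script:
run_style verlet/split

# on the command line, e.g. 12 real-space tasks and 4 KSpace tasks:
mpirun -np 16 lmp_mpi -partition 12 4 -sf omp -pk omp 1 -in in.script
</pre>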
<p>Additional performance tips are as follows:</p>

<ul>
<li>The best parallel efficiency from <em>omp</em> styles is
typically achieved when there is at least one MPI task per physical
CPU chip, i.e. socket or die.</li>

<li>It is usually most efficient to restrict threading to a single
socket, i.e. use one or more MPI tasks per socket.</li>

<li>NOTE: By default, several current MPI implementations use a
processor affinity setting that restricts each MPI task to a single
CPU core. Using multi-threading in this mode will force all threads
to share the one core and thus is likely to be counterproductive.
Instead, binding MPI tasks to a (multi-core) socket should solve this
issue (see the example after this list).</li>
</ul>
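<p>For example, with OpenMPI the task binding can be changed on the
command line (a sketch; MPICH/Hydra offers a similar "-bind-to
socket" switch, and flag names vary between MPI versions):</p>

<pre>
mpirun -np 4 --bind-to socket --map-by socket lmp_mpi -sf omp -pk omp 4 -in in.script
</pre>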
<h2>Restrictions</h2>

<p>None.</p>
</body>
</html>