Merge pull request #1430 from ndtrung81/gpu-neigh-hybrid

Enable neighbor build on the device for pair hybrid substyles
2019-06-17 14:12:40 -04:00 · 2019-06-17 14:12:40 -04:00 · 2f29bd29f4
parent daa53e3008 dc8b43a95f
commit 2f29bd29f4
14 changed files with 123 additions and 94 deletions
--- a/doc/src/Errors_messages.txt
+++ b/doc/src/Errors_messages.txt
@ -2202,10 +2202,6 @@ Self-explanatory. :dd

 This is a current restriction in LAMMPS. :dd

-{Cannot use pair hybrid with GPU neighbor list builds} :dt
-
-Neighbor list builds must be done on the CPU for this pair style. :dd
-
 {Cannot use pair tail corrections with 2d simulations} :dt

 The correction factors are only currently defined for 3d systems. :dd
@ -5523,10 +5519,6 @@ Self-explanatory. :dd
 For this pair style, you cannot run part of the force calculation on
 the host.  See the package command. :dd

-{GPU split param must be positive for hybrid pair styles} :dt
-
-See the package gpu command. :dd
-
 {GPUs are requested but Kokkos has not been compiled for CUDA} :dt

 Re-compile Kokkos with CUDA support to use GPUs. :dd
--- a/doc/src/Speed_compare.txt
+++ b/doc/src/Speed_compare.txt
@ -104,7 +104,7 @@ code (with a performance penalty due to having data transfers between
 host and GPU). :ulb,l

 The GPU package requires neighbor lists to be built on the CPU when using
-exclusion lists, hybrid pair styles, or a triclinic simulation box. :l
+exclusion lists or a triclinic simulation box. :l

 The GPU package can be compiled for CUDA or OpenCL and thus supports
 both, Nvidia and AMD GPUs well. On Nvidia hardware, using CUDA is typically
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@ -176,12 +176,10 @@ computation will be built.  If {neigh} is {yes}, which is the default,
 neighbor list building is performed on the GPU.  If {neigh} is {no},
 neighbor list building is performed on the CPU.  GPU neighbor list
 building currently cannot be used with a triclinic box.  GPU neighbor
-list calculation currently cannot be used with
-"hybrid"_pair_hybrid.html pair styles.  GPU neighbor lists are not
-compatible with commands that are not GPU-enabled.  When a non-GPU
-enabled command requires a neighbor list, it will also be built on the
-CPU.  In these cases, it will typically be more efficient to only use
-CPU neighbor list builds.
+lists are not compatible with commands that are not GPU-enabled.  When
+a non-GPU enabled command requires a neighbor list, it will also be
+built on the CPU.  In these cases, it will typically be more efficient
+to only use CPU neighbor list builds.

 The {newton} keyword sets the Newton flags for pairwise (not bonded)
 interactions to {off} or {on}, the same as the "newton"_newton.html
--- a/lib/gpu/lal_base_atomic.cpp
+++ b/lib/gpu/lal_base_atomic.cpp
@ -64,9 +64,12 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
  } else
    _nbor_data=&(nbor->dev_nbor);

-  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,false,
-                           _threads_per_atom);
+  int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
+  if (success!=0)
+    return success;
+
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_base_charge.cpp
+++ b/lib/gpu/lal_base_charge.cpp
@ -65,9 +65,12 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
  } else
    _nbor_data=&(nbor->dev_nbor);

-  int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,false,
-                           _threads_per_atom);
+  int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
+  if (success!=0)
+    return success;
+
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_base_dipole.cpp
+++ b/lib/gpu/lal_base_dipole.cpp
@ -66,9 +66,12 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
  } else
    _nbor_data=&(nbor->dev_nbor);

-  int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,false,
-                           _threads_per_atom);
+  int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
+  if (success!=0)
+    return success;
+
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_base_dpd.cpp
+++ b/lib/gpu/lal_base_dpd.cpp
@ -65,9 +65,13 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
  } else
    _nbor_data=&(nbor->dev_nbor);

-  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,false,
-                           _threads_per_atom,true);
+  int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
+  if (success!=0)
+    return success;
+
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
+
  if (success!=0)
    return success;

--- a/lib/gpu/lal_base_ellipsoid.cpp
+++ b/lib/gpu/lal_base_ellipsoid.cpp
@ -71,12 +71,15 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,

  _threads_per_atom=device->threads_per_atom();

-  int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,true,
-                           1);
+  int success=device->init(*ans,false,true,nlocal,nall,maxspecial);
  if (success!=0)
    return success;

+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,true,1);
+  if (success!=0)
+    return success;
+  
  ucl_device=device->gpu;
  atom=&device->atom;

--- a/lib/gpu/lal_base_three.cpp
+++ b/lib/gpu/lal_base_three.cpp
@ -78,9 +78,12 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
  if (_threads_per_atom*_threads_per_atom>device->warp_size())
    return -10;

-  int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
-                           maxspecial,_gpu_host,max_nbors,cell_size,false,
-                           _threads_per_atom);
+  int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
+  if (success!=0)
+    return success;
+
+  success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
+                  max_nbors,cell_size,false,_threads_per_atom);
  if (success!=0)
    return success;

--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -246,11 +246,8 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
 template <class numtyp, class acctyp>
 int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
                  const bool rot, const int nlocal,
-                  const int host_nlocal, const int nall,
-                  Neighbor *nbor, const int maxspecial,
-                  const int gpu_host, const int max_nbors,
-                  const double cell_size, const bool pre_cut,
-                  const int threads_per_atom, const bool vel) {
+                  const int nall, const int maxspecial,
+                  const bool vel) {
  if (!_device_init)
    return -1;
  if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
@ -301,16 +298,6 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
  if (!ans.init(ef_nlocal,charge,rot,*gpu))
    return -3;

-  if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
-                  *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
-                  _block_cell_id, _block_nbor_build, threads_per_atom,
-                  _warp_size, _time_device, compile_string()))
-    return -3;
-  if (_cell_size<0.0)
-    nbor->cell_size(cell_size,cell_size);
-  else
-    nbor->cell_size(_cell_size,cell_size);
-
  _init_count++;
  return 0;
 }
@ -338,6 +325,39 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
  return 0;
 }

+template <class numtyp, class acctyp>  // init_nbor: initialize neighbor storage on `nbor`; returns 0 on success, -3 if nbor->init() fails
+int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
+                  const int host_nlocal, const int nall,
+                  const int maxspecial, const int gpu_host,
+                  const int max_nbors, const double cell_size,
+                  const bool pre_cut, const int threads_per_atom) {  // note: nall is not referenced in this body
+  int ef_nlocal=nlocal;  // effective local particle count handed to nbor->init()
+  if (_particle_split<1.0 && _particle_split>0.0)  // only a fractional split strictly inside (0,1) scales the count
+    ef_nlocal=static_cast<int>(_particle_split*nlocal);
+ 
+  int gpu_nbor=0;  // neighbor mode passed to nbor->init(); stays 0 unless _gpu_mode is one of the two modes below
+  if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH)
+    gpu_nbor=1;
+  else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
+    gpu_nbor=2;
+  #ifndef USE_CUDPP
+  if (gpu_nbor==1)  // when USE_CUDPP is not defined, mode 1 is remapped to mode 2
+    gpu_nbor=2;
+  #endif
+
+  if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
+                  *gpu,gpu_nbor,gpu_host,pre_cut,_block_cell_2d,
+                  _block_cell_id, _block_nbor_build, threads_per_atom,
+                  _warp_size, _time_device, compile_string()))
+    return -3;  // nbor->init() reported failure
+  if (_cell_size<0.0)  // negative device-level _cell_size: pass the caller-supplied cell_size for both arguments
+    nbor->cell_size(cell_size,cell_size);
+  else
+    nbor->cell_size(_cell_size,cell_size);  // otherwise the device-level _cell_size is the first argument
+
+  return 0;
+}
+
 template <class numtyp, class acctyp>
 void DeviceT::set_single_precompute
                     (PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm) {
@ -614,7 +634,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
    if (screen && times[6]>0.0) {
      fprintf(screen,"\n\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");
-      fprintf(screen,"    Device Time Info (average): ");
+      fprintf(screen,"    Device Time Info (average) for kspace: ");
      fprintf(screen,"\n-------------------------------------");
      fprintf(screen,"--------------------------------\n");

--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@ -53,11 +53,43 @@ class Device {
                  const int t_per_atom, const double cell_size,
                  char *vendor_string, const int block_pair);

-  /// Initialize the device for Atom and Neighbor storage
-  /** \param rot True if quaternions need to be stored
+  /// Initialize the device for Atom storage
+  /** \param charge True if charges need to be stored 
+    * \param rot True if quaternions need to be stored
+    * \param nlocal Total number of local particles to allocate memory for
+    * \param nall Total number of local+ghost particles
+    * \param maxspecial Maximum number of special bonded atoms per atom
+    * \param vel True if velocities need to be stored
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
+           const int nlocal, const int nall, const int maxspecial,
+           const bool vel=false);
+
+  /// Initialize the device for Atom storage only
+  /** \param nlocal Total number of local particles to allocate memory for
+    * \param nall Total number of local+ghost particles
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
+
+  /// Initialize the neighbor list storage
+  /** \param nbor Neighbor list object whose storage is initialized
    * \param nlocal Total number of local particles to allocate memory for
    * \param host_nlocal Initial number of host particles to allocate memory for
    * \param nall Total number of local+ghost particles
+    * \param maxspecial Maximum number of special bonded atoms per atom
    * \param gpu_host 0 if host will not perform force calculations,
    *                 1 if gpu_nbor is true, and host needs a half nbor list,
    *                 2 if gpu_nbor is true, and host needs a full nbor list
@ -73,23 +105,11 @@ class Device {
    * - -3 if there is an out of memory error
    * - -4 if the GPU library was not compiled for GPU
    * - -5 Double precision is not supported on card **/
-  int init(Answer<numtyp,acctyp> &a, const bool charge, const bool rot,
-           const int nlocal, const int host_nlocal, const int nall,
-           Neighbor *nbor, const int maxspecial, const int gpu_host,
-           const int max_nbors, const double cell_size, const bool pre_cut,
-           const int threads_per_atom, const bool vel=false);
-
-  /// Initialize the device for Atom storage only
-  /** \param nlocal Total number of local particles to allocate memory for
-    * \param nall Total number of local+ghost particles
-    *
-    * Returns:
-    * -  0 if successfull
-    * - -1 if fix gpu not found
-    * - -3 if there is an out of memory error
-    * - -4 if the GPU library was not compiled for GPU
-    * - -5 Double precision is not supported on card **/
-  int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
+  int init_nbor(Neighbor *nbor, const int nlocal,
+                const int host_nlocal, const int nall,
+                const int maxspecial, const int gpu_host,
+                const int max_nbors, const double cell_size,
+                const bool pre_cut, const int threads_per_atom);

  /// Output a message for pair_style acceleration with device stats
  void init_message(FILE *screen, const char *name,
@ -173,7 +193,7 @@ class Device {
  /// Return host memory usage in bytes
  double host_memory_usage() const;

-  /// Return the number of procs sharing a device (size of device commincator)
+  /// Return the number of procs sharing a device (size of device communicator)
  inline int procs_per_gpu() const { return _procs_per_gpu; }
  /// Return the number of threads per proc
  inline int num_threads() const { return _nthreads; }
@ -260,12 +280,12 @@ class Device {
  /// Atom Data
  Atom<numtyp,acctyp> atom;

-  // --------------------------- NBOR DATA ----------------------------
+  // --------------------------- NBOR SHARED KERNELS ----------------

-  /// Neighbor Data
+  /// Shared kernels for neighbor lists
  NeighborShared _neighbor_shared;

-  // ------------------------ LONG RANGE DATA -------------------------
+  // ------------------------ LONG RANGE DATA -----------------------

  // Long Range Data
  int _long_range_precompute;
--- a/src/GPU/README
+++ b/src/GPU/README
@ -1,9 +1,8 @@
 This package implements GPU optimizations of various LAMMPS styles.

-Section 5.3.1 on the manual gives details of what hardware and Cuda
+Section 3.7 of the manual gives details of what hardware and Cuda
 software is required on your system, and full details on how to build
-and use this package.  See the KOKKOS package, which also has
-GPU-enabled styles.
+and use this package.  The KOKKOS package also has GPU-enabled styles.

 This package uses an external library provided in lib/gpu which must
 be compiled before making LAMMPS.  See the lib/gpu/README file and the
--- a/src/GPU/fix_gpu.cpp
+++ b/src/GPU/fix_gpu.cpp
@ -219,17 +219,6 @@ void FixGPU::init()
    error->all(FLERR,"GPU package does not (yet) work with "
               "atom_style template");

-  // hybrid cannot be used with force/neigh option
-
-  if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
-    if (force->pair_match("^hybrid",0) != NULL)
-      error->all(FLERR,"Cannot use pair hybrid with GPU neighbor list builds");
-
-  if (_particle_split < 0)
-    if (force->pair_match("^hybrid",0) != NULL)
-      error->all(FLERR,"GPU split param must be positive "
-                 "for hybrid pair styles");
-
  // neighbor list builds on the GPU with triclinic box is not yet supported

  if ((_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) &&
--- a/src/GPU/fix_gpu.h
+++ b/src/GPU/fix_gpu.h
@ -65,14 +65,6 @@ E: GPU package does not (yet) work with atom_style template

 Self-explanatory.

-E: Cannot use pair hybrid with GPU neighbor list builds
-
-Neighbor list builds must be done on the CPU for this pair style.
-
-E: GPU split param must be positive for hybrid pair styles
-
-See the package gpu command.
-
 E: Cannot use package gpu neigh yes with triclinic box

 This is a current restriction in LAMMPS.