diff --git a/doc/src/Errors_messages.txt b/doc/src/Errors_messages.txt index 7249bfddfd..b993564656 100644 --- a/doc/src/Errors_messages.txt +++ b/doc/src/Errors_messages.txt @@ -2202,10 +2202,6 @@ Self-explanatory. :dd This is a current restriction in LAMMPS. :dd -{Cannot use pair hybrid with GPU neighbor list builds} :dt - -Neighbor list builds must be done on the CPU for this pair style. :dd - {Cannot use pair tail corrections with 2d simulations} :dt The correction factors are only currently defined for 3d systems. :dd @@ -5523,10 +5519,6 @@ Self-explanatory. :dd For this pair style, you cannot run part of the force calculation on the host. See the package command. :dd -{GPU split param must be positive for hybrid pair styles} :dt - -See the package gpu command. :dd - {GPUs are requested but Kokkos has not been compiled for CUDA} :dt Re-compile Kokkos with CUDA support to use GPUs. :dd diff --git a/doc/src/Speed_compare.txt b/doc/src/Speed_compare.txt index c93407515e..c95af3cb22 100644 --- a/doc/src/Speed_compare.txt +++ b/doc/src/Speed_compare.txt @@ -104,7 +104,7 @@ code (with a performance penalty due to having data transfers between host and GPU). :ulb,l The GPU package requires neighbor lists to be built on the CPU when using -exclusion lists, hybrid pair styles, or a triclinic simulation box. :l +exclusion lists or a triclinic simulation box. :l The GPU package can be compiled for CUDA or OpenCL and thus supports both, Nvidia and AMD GPUs well. On Nvidia hardware, using CUDA is typically diff --git a/doc/src/package.txt b/doc/src/package.txt index dfdecab0a5..6a6d17bcbc 100644 --- a/doc/src/package.txt +++ b/doc/src/package.txt @@ -176,12 +176,10 @@ computation will be built. If {neigh} is {yes}, which is the default, neighbor list building is performed on the GPU. If {neigh} is {no}, neighbor list building is performed on the CPU. GPU neighbor list building currently cannot be used with a triclinic box. 
GPU neighbor -list calculation currently cannot be used with -"hybrid"_pair_hybrid.html pair styles. GPU neighbor lists are not -compatible with commands that are not GPU-enabled. When a non-GPU -enabled command requires a neighbor list, it will also be built on the -CPU. In these cases, it will typically be more efficient to only use -CPU neighbor list builds. +lists are not compatible with commands that are not GPU-enabled. When +a non-GPU enabled command requires a neighbor list, it will also be +built on the CPU. In these cases, it will typically be more efficient +to only use CPU neighbor list builds. The {newton} keyword sets the Newton flags for pairwise (not bonded) interactions to {off} or {on}, the same as the "newton"_newton.html diff --git a/lib/gpu/lal_base_atomic.cpp b/lib/gpu/lal_base_atomic.cpp index e59dae1a6f..da54f1dca3 100644 --- a/lib/gpu/lal_base_atomic.cpp +++ b/lib/gpu/lal_base_atomic.cpp @@ -64,9 +64,12 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, } else _nbor_data=&(nbor->dev_nbor); - int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, - maxspecial,_gpu_host,max_nbors,cell_size,false, - _threads_per_atom); + int success=device->init(*ans,false,false,nlocal,nall,maxspecial); + if (success!=0) + return success; + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); if (success!=0) return success; diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index c6341f7d57..a3ec710baa 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -65,9 +65,12 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, } else _nbor_data=&(nbor->dev_nbor); - int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor, - maxspecial,_gpu_host,max_nbors,cell_size,false, - _threads_per_atom); + int success=device->init(*ans,true,false,nlocal,nall,maxspecial); + if (success!=0) + return 
success; + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 478f0092c7..9fc7e1b235 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -66,9 +66,12 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, } else _nbor_data=&(nbor->dev_nbor); - int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor, - maxspecial,_gpu_host,max_nbors,cell_size,false, - _threads_per_atom); + int success=device->init(*ans,true,true,nlocal,nall,maxspecial); + if (success!=0) + return success; + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index 941f463b14..eb5c2088a6 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -65,9 +65,13 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, } else _nbor_data=&(nbor->dev_nbor); - int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, - maxspecial,_gpu_host,max_nbors,cell_size,false, - _threads_per_atom,true); + int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true); + if (success!=0) + return success; + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) return success; diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 8918a3140c..eea5344e33 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -71,12 +71,15 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, _threads_per_atom=device->threads_per_atom(); - int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor, - 
maxspecial,_gpu_host,max_nbors,cell_size,true, - 1); + int success=device->init(*ans,false,true,nlocal,nall,maxspecial); if (success!=0) return success; + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,true,1); + if (success!=0) + return success; + ucl_device=device->gpu; atom=&device->atom; diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index aa77a48c66..0510b84d92 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -78,9 +78,12 @@ int BaseThreeT::init_three(const int nlocal, const int nall, if (_threads_per_atom*_threads_per_atom>device->warp_size()) return -10; - int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, - maxspecial,_gpu_host,max_nbors,cell_size,false, - _threads_per_atom); + int success=device->init(*ans,false,false,nlocal,nall,maxspecial); + if (success!=0) + return success; + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); if (success!=0) return success; diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 9397f3c6c5..5534d32e5f 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -246,11 +246,8 @@ int DeviceT::set_ocl_params(char *ocl_vendor) { template int DeviceT::init(Answer &ans, const bool charge, const bool rot, const int nlocal, - const int host_nlocal, const int nall, - Neighbor *nbor, const int maxspecial, - const int gpu_host, const int max_nbors, - const double cell_size, const bool pre_cut, - const int threads_per_atom, const bool vel) { + const int nall, const int maxspecial, + const bool vel) { if (!_device_init) return -1; if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) @@ -301,16 +298,6 @@ int DeviceT::init(Answer &ans, const bool charge, if (!ans.init(ef_nlocal,charge,rot,*gpu)) return -3; - if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, - 
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, - _block_cell_id, _block_nbor_build, threads_per_atom, - _warp_size, _time_device, compile_string())) - return -3; - if (_cell_size<0.0) - nbor->cell_size(cell_size,cell_size); - else - nbor->cell_size(_cell_size,cell_size); - _init_count++; return 0; } @@ -338,6 +325,39 @@ int DeviceT::init(Answer &ans, const int nlocal, return 0; } +template +int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, + const int host_nlocal, const int nall, + const int maxspecial, const int gpu_host, + const int max_nbors, const double cell_size, + const bool pre_cut, const int threads_per_atom) { + int ef_nlocal=nlocal; + if (_particle_split<1.0 && _particle_split>0.0) + ef_nlocal=static_cast(_particle_split*nlocal); + + int gpu_nbor=0; + if (_gpu_mode==Device::GPU_NEIGH) + gpu_nbor=1; + else if (_gpu_mode==Device::GPU_HYB_NEIGH) + gpu_nbor=2; + #ifndef USE_CUDPP + if (gpu_nbor==1) + gpu_nbor=2; + #endif + + if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, + *gpu,gpu_nbor,gpu_host,pre_cut,_block_cell_2d, + _block_cell_id, _block_nbor_build, threads_per_atom, + _warp_size, _time_device, compile_string())) + return -3; + if (_cell_size<0.0) + nbor->cell_size(cell_size,cell_size); + else + nbor->cell_size(_cell_size,cell_size); + + return 0; +} + template void DeviceT::set_single_precompute (PPPM *pppm) { @@ -614,7 +634,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in, if (screen && times[6]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - fprintf(screen," Device Time Info (average): "); + fprintf(screen," Device Time Info (average) for kspace: "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 695b0a62f9..0c4d5f8c43 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -53,11 +53,43 @@ 
class Device { const int t_per_atom, const double cell_size, char *vendor_string, const int block_pair); - /// Initialize the device for Atom and Neighbor storage - /** \param rot True if quaternions need to be stored + /// Initialize the device for Atom storage + /** \param charge True if charges need to be stored + * \param rot True if quaternions need to be stored + * \param nlocal Total number of local particles to allocate memory for + * \param nall Total number of local+ghost particles + * \param maxspecial Maximum number of special bonded atoms per atom + * \param vel True if velocities need to be stored + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(Answer &ans, const bool charge, const bool rot, + const int nlocal, const int nall, const int maxspecial, + const bool vel=false); + + /// Initialize the device for Atom storage only + /** \param nlocal Total number of local particles to allocate memory for + * \param nall Total number of local+ghost particles + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(Answer &ans, const int nlocal, const int nall); + + /// Initialize the neighbor list storage + /** \param nbor Neighbor object whose list storage is initialized + * by this call * \param nlocal Total number of local particles to allocate memory for * \param host_nlocal Initial number of host particles to allocate memory for * \param nall Total number of local+ghost particles + * \param maxspecial Maximum number of special bonded atoms per atom * \param gpu_host 0 if host will not perform force calculations, * 1 if gpu_nbor is true, and host needs a half nbor list, * 2 if 
gpu_nbor is true, and host needs a full nbor list @@ -73,23 +105,11 @@ class Device { * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(Answer &a, const bool charge, const bool rot, - const int nlocal, const int host_nlocal, const int nall, - Neighbor *nbor, const int maxspecial, const int gpu_host, - const int max_nbors, const double cell_size, const bool pre_cut, - const int threads_per_atom, const bool vel=false); - - /// Initialize the device for Atom storage only - /** \param nlocal Total number of local particles to allocate memory for - * \param nall Total number of local+ghost particles - * - * Returns: - * - 0 if successfull - * - -1 if fix gpu not found - * - -3 if there is an out of memory error - * - -4 if the GPU library was not compiled for GPU - * - -5 Double precision is not supported on card **/ - int init(Answer &ans, const int nlocal, const int nall); + int init_nbor(Neighbor *nbor, const int nlocal, + const int host_nlocal, const int nall, + const int maxspecial, const int gpu_host, + const int max_nbors, const double cell_size, + const bool pre_cut, const int threads_per_atom); /// Output a message for pair_style acceleration with device stats void init_message(FILE *screen, const char *name, @@ -173,7 +193,7 @@ class Device { /// Return host memory usage in bytes double host_memory_usage() const; - /// Return the number of procs sharing a device (size of device commincator) + /// Return the number of procs sharing a device (size of device communicator) inline int procs_per_gpu() const { return _procs_per_gpu; } /// Return the number of threads per proc inline int num_threads() const { return _nthreads; } @@ -260,12 +280,12 @@ class Device { /// Atom Data Atom atom; - // --------------------------- NBOR DATA ---------------------------- + // --------------------------- NBOR SHARED KERNELS ---------------- - /// Neighbor Data + /// Shared 
kernels for neighbor lists NeighborShared _neighbor_shared; - // ------------------------ LONG RANGE DATA ------------------------- + // ------------------------ LONG RANGE DATA ----------------------- // Long Range Data int _long_range_precompute; diff --git a/src/GPU/README b/src/GPU/README index 792fc1a8b9..bba532beaf 100644 --- a/src/GPU/README +++ b/src/GPU/README @@ -1,9 +1,8 @@ This package implements GPU optimizations of various LAMMPS styles. -Section 5.3.1 on the manual gives details of what hardware and Cuda +Section 3.7 of the manual gives details of what hardware and Cuda software is required on your system, and full details on how to build -and use this package. See the KOKKOS package, which also has -GPU-enabled styles. +and use this package. The KOKKOS package also has GPU-enabled styles. This package uses an external library provided in lib/gpu which must be compiled before making LAMMPS. See the lib/gpu/README file and the diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 0d5b4334c9..f0558e6a02 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -219,17 +219,6 @@ void FixGPU::init() error->all(FLERR,"GPU package does not (yet) work with " "atom_style template"); - // hybrid cannot be used with force/neigh option - - if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) - if (force->pair_match("^hybrid",0) != NULL) - error->all(FLERR,"Cannot use pair hybrid with GPU neighbor list builds"); - - if (_particle_split < 0) - if (force->pair_match("^hybrid",0) != NULL) - error->all(FLERR,"GPU split param must be positive " - "for hybrid pair styles"); - // neighbor list builds on the GPU with triclinic box is not yet supported if ((_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) && diff --git a/src/GPU/fix_gpu.h b/src/GPU/fix_gpu.h index c190a91061..ba0b4c83cb 100644 --- a/src/GPU/fix_gpu.h +++ b/src/GPU/fix_gpu.h @@ -65,14 +65,6 @@ E: GPU package does not (yet) work with atom_style template Self-explanatory. 
-E: Cannot use pair hybrid with GPU neighbor list builds - -Neighbor list builds must be done on the CPU for this pair style. - -E: GPU split param must be positive for hybrid pair styles - -See the package gpu command. - E: Cannot use package gpu neigh yes with triclinic box This is a current restriction in LAMMPS.