Merge pull request #1430 from ndtrung81/gpu-neigh-hybrid

Enable neighbor build on the device for pair hybrid substyles
This commit is contained in:
Axel Kohlmeyer 2019-06-17 14:12:40 -04:00 committed by GitHub
commit 2f29bd29f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 123 additions and 94 deletions

View File

@ -2202,10 +2202,6 @@ Self-explanatory. :dd
This is a current restriction in LAMMPS. :dd
{Cannot use pair hybrid with GPU neighbor list builds} :dt
Neighbor list builds must be done on the CPU for this pair style. :dd
{Cannot use pair tail corrections with 2d simulations} :dt
The correction factors are only currently defined for 3d systems. :dd
@ -5523,10 +5519,6 @@ Self-explanatory. :dd
For this pair style, you cannot run part of the force calculation on
the host. See the package command. :dd
{GPU split param must be positive for hybrid pair styles} :dt
See the package gpu command. :dd
{GPUs are requested but Kokkos has not been compiled for CUDA} :dt
Re-compile Kokkos with CUDA support to use GPUs. :dd

View File

@ -104,7 +104,7 @@ code (with a performance penalty due to having data transfers between
host and GPU). :ulb,l
The GPU package requires neighbor lists to be built on the CPU when using
exclusion lists, hybrid pair styles, or a triclinic simulation box. :l
exclusion lists, or a triclinic simulation box. :l
The GPU package can be compiled for CUDA or OpenCL and thus supports
both, Nvidia and AMD GPUs well. On Nvidia hardware, using CUDA is typically

View File

@ -176,12 +176,10 @@ computation will be built. If {neigh} is {yes}, which is the default,
neighbor list building is performed on the GPU. If {neigh} is {no},
neighbor list building is performed on the CPU. GPU neighbor list
building currently cannot be used with a triclinic box. GPU neighbor
list calculation currently cannot be used with
"hybrid"_pair_hybrid.html pair styles. GPU neighbor lists are not
compatible with commands that are not GPU-enabled. When a non-GPU
enabled command requires a neighbor list, it will also be built on the
CPU. In these cases, it will typically be more efficient to only use
CPU neighbor list builds.
lists are not compatible with commands that are not GPU-enabled. When
a non-GPU enabled command requires a neighbor list, it will also be
built on the CPU. In these cases, it will typically be more efficient
to only use CPU neighbor list builds.
The {newton} keyword sets the Newton flags for pairwise (not bonded)
interactions to {off} or {on}, the same as the "newton"_newton.html

View File

@ -64,9 +64,12 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
if (success!=0)
return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

View File

@ -65,9 +65,12 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
if (success!=0)
return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

View File

@ -66,9 +66,12 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
if (success!=0)
return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

View File

@ -65,9 +65,13 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom,true);
int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
if (success!=0)
return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

View File

@ -71,12 +71,15 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,true,
1);
int success=device->init(*ans,false,true,nlocal,nall,maxspecial);
if (success!=0)
return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,true,1);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;

View File

@ -78,9 +78,12 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10;
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
if (success!=0)
return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

View File

@ -246,11 +246,8 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
Neighbor *nbor, const int maxspecial,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut,
const int threads_per_atom, const bool vel) {
const int nall, const int maxspecial,
const bool vel) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
@ -301,16 +298,6 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
if (!ans.init(ef_nlocal,charge,rot,*gpu))
return -3;
if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
_block_cell_id, _block_nbor_build, threads_per_atom,
_warp_size, _time_device, compile_string()))
return -3;
if (_cell_size<0.0)
nbor->cell_size(cell_size,cell_size);
else
nbor->cell_size(_cell_size,cell_size);
_init_count++;
return 0;
}
@ -338,6 +325,39 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
return 0;
}
// Prepare the neighbor list storage held by nbor for use with this device.
// Returns 0 on success, or -3 when nbor->init() reports a failure.
template <class numtyp, class acctyp>
int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
                       const int host_nlocal, const int nall,
                       const int maxspecial, const int gpu_host,
                       const int max_nbors, const double cell_size,
                       const bool pre_cut, const int threads_per_atom) {
  // The local atom count is scaled only for a split strictly between 0 and 1
  const bool partial_split=(_particle_split>0.0 && _particle_split<1.0);
  int dev_nlocal=partial_split ?
    static_cast<int>(_particle_split*nlocal) : nlocal;

  // Translate the device mode into the flag handed to nbor->init()
  int nbor_flag=0;
  if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH) {
#ifdef USE_CUDPP
    nbor_flag=1;
#else
    // Without USE_CUDPP, flag 1 is always replaced by flag 2
    nbor_flag=2;
#endif
  } else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH) {
    nbor_flag=2;
  }

  if (!nbor->init(&_neighbor_shared,dev_nlocal,host_nlocal,max_nbors,
                  maxspecial,*gpu,nbor_flag,gpu_host,pre_cut,
                  _block_cell_2d,_block_cell_id,_block_nbor_build,
                  threads_per_atom,_warp_size,_time_device,
                  compile_string()))
    return -3;

  // A negative device-level _cell_size means the caller's cell_size is
  // passed for both arguments; otherwise _cell_size is passed first
  if (!(_cell_size<0.0))
    nbor->cell_size(_cell_size,cell_size);
  else
    nbor->cell_size(cell_size,cell_size);

  return 0;
}
template <class numtyp, class acctyp>
void DeviceT::set_single_precompute
(PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm) {
@ -614,7 +634,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
if (screen && times[6]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," Device Time Info (average): ");
fprintf(screen," Device Time Info (average) for kspace: ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");

View File

@ -53,11 +53,43 @@ class Device {
const int t_per_atom, const double cell_size,
char *vendor_string, const int block_pair);
/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored
/// Initialize the device for Atom storage
/** \param charge True if charges need to be stored
* \param rot True if quaternions need to be stored
* \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
* \param maxspecial Maximum number of special bonded atoms per atom
* \param vel True if velocities need to be stored
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
const int nlocal, const int nall, const int maxspecial,
const bool vel=false);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successfull
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
/// Initialize the neighbor list storage
/** \param nbor Neighbor object whose storage is initialized
* \param nlocal Total number of local particles to allocate memory for
* \param host_nlocal Initial number of host particles to allocate memory for
* \param nall Total number of local+ghost particles
* \param maxspecial Maximum number of special bonded atoms per atom
* \param gpu_host 0 if host will not perform force calculations,
* 1 if gpu_nbor is true, and host needs a half nbor list,
* 2 if gpu_nbor is true, and host needs a full nbor list
@ -73,23 +105,11 @@ class Device {
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &a, const bool charge, const bool rot,
const int nlocal, const int host_nlocal, const int nall,
Neighbor *nbor, const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size, const bool pre_cut,
const int threads_per_atom, const bool vel=false);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successfull
* - 0 if successful
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
int init_nbor(Neighbor *nbor, const int nlocal,
const int host_nlocal, const int nall,
const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size,
const bool pre_cut, const int threads_per_atom);
/// Output a message for pair_style acceleration with device stats
void init_message(FILE *screen, const char *name,
@ -173,7 +193,7 @@ class Device {
/// Return host memory usage in bytes
double host_memory_usage() const;
/// Return the number of procs sharing a device (size of device commincator)
/// Return the number of procs sharing a device (size of device communicator)
inline int procs_per_gpu() const { return _procs_per_gpu; }
/// Return the number of threads per proc
inline int num_threads() const { return _nthreads; }
@ -260,12 +280,12 @@ class Device {
/// Atom Data
Atom<numtyp,acctyp> atom;
// --------------------------- NBOR DATA ----------------------------
// --------------------------- NBOR SHARED KERNELS ----------------
/// Neighbor Data
/// Shared kernels for neighbor lists
NeighborShared _neighbor_shared;
// ------------------------ LONG RANGE DATA -------------------------
// ------------------------ LONG RANGE DATA -----------------------
// Long Range Data
int _long_range_precompute;

View File

@ -1,9 +1,8 @@
This package implements GPU optimizations of various LAMMPS styles.
Section 5.3.1 on the manual gives details of what hardware and Cuda
Section 3.7 of the manual gives details of what hardware and Cuda
software is required on your system, and full details on how to build
and use this package. See the KOKKOS package, which also has
GPU-enabled styles.
and use this package. The KOKKOS package also has GPU-enabled styles.
This package uses an external library provided in lib/gpu which must
be compiled before making LAMMPS. See the lib/gpu/README file and the

View File

@ -219,17 +219,6 @@ void FixGPU::init()
error->all(FLERR,"GPU package does not (yet) work with "
"atom_style template");
// hybrid cannot be used with force/neigh option
if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
if (force->pair_match("^hybrid",0) != NULL)
error->all(FLERR,"Cannot use pair hybrid with GPU neighbor list builds");
if (_particle_split < 0)
if (force->pair_match("^hybrid",0) != NULL)
error->all(FLERR,"GPU split param must be positive "
"for hybrid pair styles");
// neighbor list builds on the GPU with triclinic box is not yet supported
if ((_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) &&

View File

@ -65,14 +65,6 @@ E: GPU package does not (yet) work with atom_style template
Self-explanatory.
E: Cannot use pair hybrid with GPU neighbor list builds
Neighbor list builds must be done on the CPU for this pair style.
E: GPU split param must be positive for hybrid pair styles
See the package gpu command.
E: Cannot use package gpu neigh yes with triclinic box
This is a current restriction in LAMMPS.