forked from lijiext/lammps
Merge pull request #1430 from ndtrung81/gpu-neigh-hybrid
Enable neighbor build on the device for pair hybrid substyles
This commit is contained in:
commit
2f29bd29f4
|
@ -2202,10 +2202,6 @@ Self-explanatory. :dd
|
|||
|
||||
This is a current restriction in LAMMPS. :dd
|
||||
|
||||
{Cannot use pair hybrid with GPU neighbor list builds} :dt
|
||||
|
||||
Neighbor list builds must be done on the CPU for this pair style. :dd
|
||||
|
||||
{Cannot use pair tail corrections with 2d simulations} :dt
|
||||
|
||||
The correction factors are only currently defined for 3d systems. :dd
|
||||
|
@ -5523,10 +5519,6 @@ Self-explanatory. :dd
|
|||
For this pair style, you cannot run part of the force calculation on
|
||||
the host. See the package command. :dd
|
||||
|
||||
{GPU split param must be positive for hybrid pair styles} :dt
|
||||
|
||||
See the package gpu command. :dd
|
||||
|
||||
{GPUs are requested but Kokkos has not been compiled for CUDA} :dt
|
||||
|
||||
Re-compile Kokkos with CUDA support to use GPUs. :dd
|
||||
|
|
|
@ -104,7 +104,7 @@ code (with a performance penalty due to having data transfers between
|
|||
host and GPU). :ulb,l
|
||||
|
||||
The GPU package requires neighbor lists to be built on the CPU when using
|
||||
exclusion lists, hybrid pair styles, or a triclinic simulation box. :l
|
||||
exclusion lists, or a triclinic simulation box. :l
|
||||
|
||||
The GPU package can be compiled for CUDA or OpenCL and thus supports
|
||||
both Nvidia and AMD GPUs well. On Nvidia hardware, using CUDA is typically
|
||||
|
|
|
@ -176,12 +176,10 @@ computation will be built. If {neigh} is {yes}, which is the default,
|
|||
neighbor list building is performed on the GPU. If {neigh} is {no},
|
||||
neighbor list building is performed on the CPU. GPU neighbor list
|
||||
building currently cannot be used with a triclinic box. GPU neighbor
|
||||
list calculation currently cannot be used with
|
||||
"hybrid"_pair_hybrid.html pair styles. GPU neighbor lists are not
|
||||
compatible with commands that are not GPU-enabled. When a non-GPU
|
||||
enabled command requires a neighbor list, it will also be built on the
|
||||
CPU. In these cases, it will typically be more efficient to only use
|
||||
CPU neighbor list builds.
|
||||
lists are not compatible with commands that are not GPU-enabled. When
|
||||
a non-GPU enabled command requires a neighbor list, it will also be
|
||||
built on the CPU. In these cases, it will typically be more efficient
|
||||
to only use CPU neighbor list builds.
|
||||
|
||||
The {newton} keyword sets the Newton flags for pairwise (not bonded)
|
||||
interactions to {off} or {on}, the same as the "newton"_newton.html
|
||||
|
|
|
@ -64,9 +64,12 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
|||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
|
||||
maxspecial,_gpu_host,max_nbors,cell_size,false,
|
||||
_threads_per_atom);
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
|
|
@ -65,9 +65,12 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
|||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
|
||||
maxspecial,_gpu_host,max_nbors,cell_size,false,
|
||||
_threads_per_atom);
|
||||
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
|
|
@ -66,9 +66,12 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
|
|||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
|
||||
maxspecial,_gpu_host,max_nbors,cell_size,false,
|
||||
_threads_per_atom);
|
||||
int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
|
|
@ -65,9 +65,13 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
|
|||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
|
||||
maxspecial,_gpu_host,max_nbors,cell_size,false,
|
||||
_threads_per_atom,true);
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
|
|
@ -71,12 +71,15 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
|
|||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
|
||||
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,
|
||||
maxspecial,_gpu_host,max_nbors,cell_size,true,
|
||||
1);
|
||||
int success=device->init(*ans,false,true,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,true,1);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
ucl_device=device->gpu;
|
||||
atom=&device->atom;
|
||||
|
||||
|
|
|
@ -78,9 +78,12 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
|
|||
if (_threads_per_atom*_threads_per_atom>device->warp_size())
|
||||
return -10;
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
|
||||
maxspecial,_gpu_host,max_nbors,cell_size,false,
|
||||
_threads_per_atom);
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
|
|
|
@ -246,11 +246,8 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
|
|||
template <class numtyp, class acctyp>
|
||||
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
||||
const bool rot, const int nlocal,
|
||||
const int host_nlocal, const int nall,
|
||||
Neighbor *nbor, const int maxspecial,
|
||||
const int gpu_host, const int max_nbors,
|
||||
const double cell_size, const bool pre_cut,
|
||||
const int threads_per_atom, const bool vel) {
|
||||
const int nall, const int maxspecial,
|
||||
const bool vel) {
|
||||
if (!_device_init)
|
||||
return -1;
|
||||
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
|
||||
|
@ -301,16 +298,6 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
|
|||
if (!ans.init(ef_nlocal,charge,rot,*gpu))
|
||||
return -3;
|
||||
|
||||
if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
|
||||
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
|
||||
_block_cell_id, _block_nbor_build, threads_per_atom,
|
||||
_warp_size, _time_device, compile_string()))
|
||||
return -3;
|
||||
if (_cell_size<0.0)
|
||||
nbor->cell_size(cell_size,cell_size);
|
||||
else
|
||||
nbor->cell_size(_cell_size,cell_size);
|
||||
|
||||
_init_count++;
|
||||
return 0;
|
||||
}
|
||||
|
@ -338,6 +325,39 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
|
|||
return 0;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
|
||||
const int host_nlocal, const int nall,
|
||||
const int maxspecial, const int gpu_host,
|
||||
const int max_nbors, const double cell_size,
|
||||
const bool pre_cut, const int threads_per_atom) {
|
||||
int ef_nlocal=nlocal;
|
||||
if (_particle_split<1.0 && _particle_split>0.0)
|
||||
ef_nlocal=static_cast<int>(_particle_split*nlocal);
|
||||
|
||||
int gpu_nbor=0;
|
||||
if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH)
|
||||
gpu_nbor=1;
|
||||
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
|
||||
gpu_nbor=2;
|
||||
#ifndef USE_CUDPP
|
||||
if (gpu_nbor==1)
|
||||
gpu_nbor=2;
|
||||
#endif
|
||||
|
||||
if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
|
||||
*gpu,gpu_nbor,gpu_host,pre_cut,_block_cell_2d,
|
||||
_block_cell_id, _block_nbor_build, threads_per_atom,
|
||||
_warp_size, _time_device, compile_string()))
|
||||
return -3;
|
||||
if (_cell_size<0.0)
|
||||
nbor->cell_size(cell_size,cell_size);
|
||||
else
|
||||
nbor->cell_size(_cell_size,cell_size);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void DeviceT::set_single_precompute
|
||||
(PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm) {
|
||||
|
@ -614,7 +634,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
|
|||
if (screen && times[6]>0.0) {
|
||||
fprintf(screen,"\n\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
fprintf(screen," Device Time Info (average): ");
|
||||
fprintf(screen," Device Time Info (average) for kspace: ");
|
||||
fprintf(screen,"\n-------------------------------------");
|
||||
fprintf(screen,"--------------------------------\n");
|
||||
|
||||
|
|
|
@ -53,11 +53,43 @@ class Device {
|
|||
const int t_per_atom, const double cell_size,
|
||||
char *vendor_string, const int block_pair);
|
||||
|
||||
/// Initialize the device for Atom and Neighbor storage
|
||||
/** \param rot True if quaternions need to be stored
|
||||
/// Initialize the device for Atom storage
|
||||
/** \param charge True if charges need to be stored
|
||||
* \param rot True if quaternions need to be stored
|
||||
* \param nlocal Total number of local particles to allocate memory for
|
||||
* \param nall Total number of local+ghost particles
|
||||
* \param maxspecial Maximum number of special bonded atoms per atom
|
||||
* \param vel True if velocities need to be stored
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
|
||||
const int nlocal, const int nall, const int maxspecial,
|
||||
const bool vel=false);
|
||||
|
||||
/// Initialize the device for Atom storage only
|
||||
/** \param nlocal Total number of local particles to allocate memory for
|
||||
* \param nall Total number of local+ghost particles
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
|
||||
|
||||
/// Initialize the neighbor list storage
|
||||
/** \param charge True if charges need to be stored
|
||||
* \param rot True if quaternions need to be stored
|
||||
* \param nlocal Total number of local particles to allocate memory for
|
||||
* \param host_nlocal Initial number of host particles to allocate memory for
|
||||
* \param nall Total number of local+ghost particles
|
||||
* \param maxspecial Maximum number of special bonded atoms per atom
|
||||
* \param gpu_host 0 if host will not perform force calculations,
|
||||
* 1 if gpu_nbor is true, and host needs a half nbor list,
|
||||
* 2 if gpu_nbor is true, and host needs a full nbor list
|
||||
|
@ -73,23 +105,11 @@ class Device {
|
|||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(Answer<numtyp,acctyp> &a, const bool charge, const bool rot,
|
||||
const int nlocal, const int host_nlocal, const int nall,
|
||||
Neighbor *nbor, const int maxspecial, const int gpu_host,
|
||||
const int max_nbors, const double cell_size, const bool pre_cut,
|
||||
const int threads_per_atom, const bool vel=false);
|
||||
|
||||
/// Initialize the device for Atom storage only
|
||||
/** \param nlocal Total number of local particles to allocate memory for
|
||||
* \param nall Total number of local+ghost particles
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
|
||||
int init_nbor(Neighbor *nbor, const int nlocal,
|
||||
const int host_nlocal, const int nall,
|
||||
const int maxspecial, const int gpu_host,
|
||||
const int max_nbors, const double cell_size,
|
||||
const bool pre_cut, const int threads_per_atom);
|
||||
|
||||
/// Output a message for pair_style acceleration with device stats
|
||||
void init_message(FILE *screen, const char *name,
|
||||
|
@ -173,7 +193,7 @@ class Device {
|
|||
/// Return host memory usage in bytes
|
||||
double host_memory_usage() const;
|
||||
|
||||
/// Return the number of procs sharing a device (size of device commincator)
|
||||
/// Return the number of procs sharing a device (size of device communicator)
|
||||
inline int procs_per_gpu() const { return _procs_per_gpu; }
|
||||
/// Return the number of threads per proc
|
||||
inline int num_threads() const { return _nthreads; }
|
||||
|
@ -260,12 +280,12 @@ class Device {
|
|||
/// Atom Data
|
||||
Atom<numtyp,acctyp> atom;
|
||||
|
||||
// --------------------------- NBOR DATA ----------------------------
|
||||
// --------------------------- NBOR SHARED KERNELS ----------------
|
||||
|
||||
/// Neighbor Data
|
||||
/// Shared kernels for neighbor lists
|
||||
NeighborShared _neighbor_shared;
|
||||
|
||||
// ------------------------ LONG RANGE DATA -------------------------
|
||||
// ------------------------ LONG RANGE DATA -----------------------
|
||||
|
||||
// Long Range Data
|
||||
int _long_range_precompute;
|
||||
|
|
|
@ -1,9 +1,8 @@
|
|||
This package implements GPU optimizations of various LAMMPS styles.
|
||||
|
||||
Section 5.3.1 on the manual gives details of what hardware and Cuda
|
||||
Section 3.7 of the manual gives details of what hardware and Cuda
|
||||
software is required on your system, and full details on how to build
|
||||
and use this package. See the KOKKOS package, which also has
|
||||
GPU-enabled styles.
|
||||
and use this package. The KOKKOS package also has GPU-enabled styles.
|
||||
|
||||
This package uses an external library provided in lib/gpu which must
|
||||
be compiled before making LAMMPS. See the lib/gpu/README file and the
|
||||
|
|
|
@ -219,17 +219,6 @@ void FixGPU::init()
|
|||
error->all(FLERR,"GPU package does not (yet) work with "
|
||||
"atom_style template");
|
||||
|
||||
// hybrid cannot be used with force/neigh option
|
||||
|
||||
if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
|
||||
if (force->pair_match("^hybrid",0) != NULL)
|
||||
error->all(FLERR,"Cannot use pair hybrid with GPU neighbor list builds");
|
||||
|
||||
if (_particle_split < 0)
|
||||
if (force->pair_match("^hybrid",0) != NULL)
|
||||
error->all(FLERR,"GPU split param must be positive "
|
||||
"for hybrid pair styles");
|
||||
|
||||
// neighbor list builds on the GPU with triclinic box is not yet supported
|
||||
|
||||
if ((_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) &&
|
||||
|
|
|
@ -65,14 +65,6 @@ E: GPU package does not (yet) work with atom_style template
|
|||
|
||||
Self-explanatory.
|
||||
|
||||
E: Cannot use pair hybrid with GPU neighbor list builds
|
||||
|
||||
Neighbor list builds must be done on the CPU for this pair style.
|
||||
|
||||
E: GPU split param must be positive for hybrid pair styles
|
||||
|
||||
See the package gpu command.
|
||||
|
||||
E: Cannot use package gpu neigh yes with triclinic box
|
||||
|
||||
This is a current restriction in LAMMPS.
|
||||
|
|
Loading…
Reference in New Issue