forked from lijiext/lammps

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7989 f3b2605a-c512-4ea7-a41b-209d697bcdaa

parent f5d54c5ac1
commit f3905bf229

@@ -77,6 +77,14 @@ class UCL_Device {
   /// Returns the stream indexed by i
   inline command_queue & cq(const int i) { return _cq[i]; }
 
+  /// Set the default command queue (by default this is the null stream)
+  /** \param i index of the command queue (as added by push_command_queue())
+      If i is 0, the default command queue is set to the null stream **/
+  inline void set_command_queue(const int i) {
+    if (i==0) _cq[0]=0;
+    else _cq[0]=_cq[i];
+  }
+
   /// Block until all commands in the default stream have completed
   inline void sync() { sync(0); }
 
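
The block added above lets callers alias the default command queue (index 0) to any stream created with push_command_queue(), or reset it to the null stream. A hedged usage sketch, assuming the appropriate geryon device header is already included; only cq(), set_command_queue(), push_command_queue(), pop_command_queue(), num_queues() and sync() come from the class, everything else is illustrative:

// Hedged usage sketch (not part of this commit): temporarily route work that
// targets the default queue onto an extra stream, then restore the null stream.
void run_on_side_stream(UCL_Device &dev) {
  dev.push_command_queue();            // new stream appended at index num_queues()-1
  const int side = dev.num_queues()-1;

  dev.set_command_queue(side);         // queue 0 now aliases the side stream
  // ... enqueue kernels / copies that implicitly use command queue 0 ...
  dev.sync();                          // block until the (aliased) default stream drains

  dev.set_command_queue(0);            // back to the null stream
  dev.pop_command_queue();
}
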
@@ -127,7 +135,8 @@ class UCL_Device {
   /// Get the number of cores
   inline unsigned cores(const int i)
     { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
-      else return _properties[i].multiProcessorCount*32; }
+      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32;
+      else return _properties[i].multiProcessorCount*192; }
 
   /// Get the gigabytes of global memory in the current device
   inline double gigabytes() { return gigabytes(_device); }
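
This hunk (and the matching one further down) extends cores() to Kepler-class devices: 8 cores per multiprocessor below compute capability 2.0, 32 for 2.x, and 192 for 3.x. A standalone sketch of the same arithmetic; the function name and plain-double interface are illustrative, not part of the library:

// Illustrative helper mirroring the lookup in cores() above:
// total cores = multiprocessors * cores-per-multiprocessor(arch).
unsigned cores_from_arch(double arch, unsigned multiprocessors) {
  if (arch < 2.0) return multiprocessors * 8;         // compute capability 1.x
  else if (arch < 3.0) return multiprocessors * 32;   // Fermi (2.x)
  else return multiprocessors * 192;                  // Kepler (3.x), per this commit
}
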
@@ -205,6 +214,7 @@ inline int UCL_Device::set(int num) {
   if (_device==num)
     return UCL_SUCCESS;
   for (int i=1; i<num_queues(); i++) pop_command_queue();
+  _cq[0]=0;
   cudaThreadExit();
   cudaError err=cudaSetDevice(_device_ids[num]);
   if (err!=cudaSuccess) {

@@ -151,7 +151,8 @@ class UCL_Device {
   /// Get the number of cores
   inline unsigned cores(const int i)
     { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
-      else return _properties[i].multiProcessorCount*32; }
+      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32;
+      else return _properties[i].multiProcessorCount*192; }
 
   /// Get the gigabytes of global memory in the current device
   inline double gigabytes() { return gigabytes(_device); }

@@ -124,7 +124,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
   if (my_gpu>=gpu->num_devices())
     return -2;
 
-  #ifndef CUDA_PRX
+  #ifndef CUDA_PROXY
   if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false)
     return -7;
   #endif
@@ -203,7 +203,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
   if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
                   *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
                   _block_cell_id, _block_nbor_build, threads_per_atom,
-                  _time_device))
+                  _warp_size, _time_device))
     return -3;
   nbor->cell_size(cell_size);
 

@@ -37,13 +37,16 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
                     const int gpu_nbor, const int gpu_host,
                     const bool pre_cut, const int block_cell_2d,
                     const int block_cell_id, const int block_nbor_build,
-                    const int threads_per_atom, const bool time_device) {
+                    const int threads_per_atom, const int warp_size,
+                    const bool time_device) {
   clear();
 
   _threads_per_atom=threads_per_atom;
   _block_cell_2d=block_cell_2d;
   _block_cell_id=block_cell_id;
+  _max_block_nbor_build=block_nbor_build;
   _block_nbor_build=block_nbor_build;
+  _warp_size=warp_size;
   _shared=shared;
   dev=&devi;
   _gpu_nbor=gpu_nbor;
@@ -418,6 +421,8 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   for (int i=0; i<_ncells; i++)
     mn=std::max(mn,host_cell_counts[i]);
   mn*=8;
+  set_nbor_block_size(mn/2);
+
   resize_max_neighbors<numtyp,acctyp>(mn,success);
   if (!success)
     return;
@@ -497,6 +502,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   mn=host_acc[0];
   for (int i=1; i<nt; i++)
     mn=std::max(mn,host_acc[i]);
+  set_nbor_block_size(mn);
 
   if (mn>_max_nbors) {
     resize_max_neighbors<numtyp,acctyp>(mn,success);

@@ -67,7 +67,7 @@ class Neighbor {
             const int gpu_nbor, const int gpu_host, const bool pre_cut,
             const int block_cell_2d, const int block_cell_id,
             const int block_nbor_build, const int threads_per_atom,
-            const bool time_device);
+            const int warp_size, const bool time_device);
 
   /// Set the size of the cutoff+skin
   inline void cell_size(const double size) { _cell_size=size; }
@@ -237,12 +237,20 @@ class Neighbor {
   double _gpu_bytes, _c_bytes, _cell_bytes;
   void alloc(bool &success);
 
-  int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells;
-  int _threads_per_atom;
-  int _total_atoms;
+  int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build;
+  int _ncells, _threads_per_atom, _total_atoms;
 
   template <class numtyp, class acctyp>
   inline void resize_max_neighbors(const int maxn, bool &success);
+
+  int _warp_size;
+  inline void set_nbor_block_size(const int mn) {
+    int desired=mn/(2*_warp_size);
+    desired*=_warp_size;
+    if (desired<_warp_size) desired=_warp_size;
+    else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build;
+    _block_nbor_build=desired;
+  }
 };
 
 }
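
The new set_nbor_block_size() rounds half the requested neighbor-build block size down to a whole number of warps and clamps it to [_warp_size, _max_block_nbor_build]; the two build_nbor_list() hunks above feed it either the inflated cell-count estimate (mn*8, then mn/2) or the exact maximum neighbor count. A minimal standalone sketch of the rounding with worked numbers; the free-function name is illustrative, only the arithmetic mirrors the member function:

#include <cstdio>

// Mirror of the rounding in Neighbor::set_nbor_block_size(): round mn/2 down
// to a multiple of warp_size, then clamp to [warp_size, max_block].
int nbor_block_size(int mn, int warp_size, int max_block) {
  int desired = mn / (2 * warp_size);   // how many whole warps fit in half the request
  desired *= warp_size;                 // back to threads, now warp-aligned
  if (desired < warp_size) desired = warp_size;
  else if (desired > max_block) desired = max_block;
  return desired;
}

int main() {
  // E.g. a requested size of 240 with a 32-thread warp and a 128-thread cap:
  // 240/(2*32) = 3 warps -> 96 threads, within [32,128].
  std::printf("%d\n", nbor_block_size(240, 32, 128));     // prints 96
  // A tiny request is pinned to one warp, a huge one to the cap:
  std::printf("%d %d\n", nbor_block_size(10, 32, 128),    // prints 32
                         nbor_block_size(5000, 32, 128)); // prints 128
  return 0;
}
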
@@ -119,6 +119,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
   int ix = BLOCK_ID_X;
   int iy = BLOCK_ID_Y % ncelly;
   int iz = BLOCK_ID_Y / ncelly;
+  int bsx = BLOCK_SIZE_X;
 
   int icell = ix + iy*ncellx + iz*ncellx*ncelly;
 
@@ -134,9 +135,9 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
 
   numtyp4 diff;
   numtyp r2;
-  int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/BLOCK_SIZE_X);
+  int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/bsx);
   for (int ii = 0; ii < cap; ii++) {
-    int i = icell_begin + tid + ii*BLOCK_SIZE_X;
+    int i = icell_begin + tid + ii*bsx;
     int pid_i = nall, pid_j, stride;
     numtyp4 atom_i, atom_j;
     int cnt = 0;
@@ -173,14 +174,13 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
       int num_atom_cell = jcell_end - jcell_begin;
 
       // load jcell to shared memory
-      int num_iter = ucl_ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD);
+      int num_iter = ucl_ceil((numtyp)num_atom_cell/bsx);
 
       for (int k = 0; k < num_iter; k++) {
-        int end_idx = min(BLOCK_NBOR_BUILD,
-                          num_atom_cell-k*BLOCK_NBOR_BUILD);
+        int end_idx = min(bsx, num_atom_cell-k*bsx);
 
         if (tid < end_idx) {
-          pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin];
+          pid_j = cell_particle_id[tid+k*bsx+jcell_begin];
           cell_list_sh[tid] = pid_j;
           atom_j = fetch_pos(pid_j,x_); //[pid_j];
           pos_sh[tid].x = atom_j.x;
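
In the kernel, the compile-time constants BLOCK_SIZE_X and BLOCK_NBOR_BUILD are replaced by the runtime block width bsx, so the block size chosen on the host by set_nbor_block_size() actually takes effect at launch time. A reduced CUDA sketch of the same pattern; the kernel name, arguments and strided-copy body are illustrative and are not the library's calc_neigh_list_cell:

// Illustrative CUDA kernel: walk a cell's atoms in chunks of the *runtime*
// block width (blockDim.x) instead of a compile-time constant, so one binary
// works for any block size chosen on the host.
__global__ void copy_cell_chunks(const int *cell_particle_id,
                                 int cell_begin, int cell_end, int *out) {
  const int tid = threadIdx.x;
  const int bsx = blockDim.x;                       // analogue of the new bsx
  const int n   = cell_end - cell_begin;
  const int num_iter = (n + bsx - 1) / bsx;         // ceil(n / bsx)

  for (int k = 0; k < num_iter; k++) {
    const int end_idx = min(bsx, n - k*bsx);        // atoms left in this chunk
    if (tid < end_idx)
      out[k*bsx + tid] = cell_particle_id[cell_begin + k*bsx + tid];
    __syncthreads();                                // chunk boundary, as in the original
  }
}
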
@@ -100,6 +100,8 @@
 
 #else
 
+#if (ARCH < 300)
+
 #define THREADS_PER_ATOM 4
 #define THREADS_PER_CHARGE 8
 #define BLOCK_NBOR_BUILD 128
@@ -107,6 +109,17 @@
 #define BLOCK_BIO_PAIR 128
 #define MAX_SHARED_TYPES 11
 
+#else
+
+#define THREADS_PER_ATOM 4
+#define THREADS_PER_CHARGE 8
+#define BLOCK_NBOR_BUILD 128
+#define BLOCK_PAIR 512
+#define BLOCK_BIO_PAIR 512
+#define MAX_SHARED_TYPES 11
+
+#endif
+
 #endif
 
 #define WARP_SIZE 32