git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7989 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2012-03-28 22:10:48 +00:00
parent f5d54c5ac1
commit f3905bf229
7 changed files with 53 additions and 15 deletions

View File

@@ -77,6 +77,14 @@ class UCL_Device {
   /// Returns the stream indexed by i
   inline command_queue & cq(const int i) { return _cq[i]; }
+  /// Set the default command queue (by default this is the null stream)
+  /** \param i index of the command queue (as added by push_command_queue())
+      If i is 0, the default command queue is set to the null stream **/
+  inline void set_command_queue(const int i) {
+    if (i==0) _cq[0]=0;
+    else _cq[0]=_cq[i];
+  }
   /// Block until all commands in the default stream have completed
   inline void sync() { sync(0); }
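
For context, a minimal usage sketch of the new accessor (hypothetical caller code; it assumes only the UCL_Device members referenced in this header: push_command_queue(), set_command_queue(), and sync()):

  UCL_Device dev;             // opens the default device; _cq[0] is the null stream
  dev.push_command_queue();   // create stream 1 alongside the null stream
  dev.set_command_queue(1);   // the default queue now aliases stream 1
  // ... enqueue kernels/copies on the default queue ...
  dev.sync();                 // block until the aliased stream drains
  dev.set_command_queue(0);   // restore the null stream as the default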
@@ -127,7 +135,8 @@ class UCL_Device {
   /// Get the number of cores
   inline unsigned cores(const int i)
     { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
-      else return _properties[i].multiProcessorCount*32; }
+      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32;
+      else return _properties[i].multiProcessorCount*192; }
   /// Get the gigabytes of global memory in the current device
   inline double gigabytes() { return gigabytes(_device); }
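
The new branch tracks cores per multiprocessor across CUDA generations: 8 before Fermi (arch < 2.0), 32 on Fermi (2.x), and 192 on Kepler (3.x). A standalone restatement of the mapping (hypothetical helper, not part of Geryon):

  // Cores per multiprocessor by compute capability, as in cores() above.
  inline unsigned cores_per_mp(const double arch) {
    if (arch < 2.0) return 8;        // pre-Fermi
    else if (arch < 3.0) return 32;  // Fermi
    else return 192;                 // Kepler
  }
  // e.g. a Kepler device with 8 multiprocessors reports 8*192 = 1536 cores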
@@ -205,6 +214,7 @@ inline int UCL_Device::set(int num) {
   if (_device==num)
     return UCL_SUCCESS;
   for (int i=1; i<num_queues(); i++) pop_command_queue();
+  _cq[0]=0;
   cudaThreadExit();
   cudaError err=cudaSetDevice(_device_ids[num]);
   if (err!=cudaSuccess) {

View File

@@ -151,7 +151,8 @@ class UCL_Device {
   /// Get the number of cores
   inline unsigned cores(const int i)
     { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
-      else return _properties[i].multiProcessorCount*32; }
+      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32;
+      else return _properties[i].multiProcessorCount*192; }
   /// Get the gigabytes of global memory in the current device
   inline double gigabytes() { return gigabytes(_device); }

View File

@@ -124,7 +124,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
   if (my_gpu>=gpu->num_devices())
     return -2;
-  #ifndef CUDA_PRX
+  #ifndef CUDA_PROXY
   if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false)
     return -7;
   #endif
@@ -203,7 +203,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
   if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
                   *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
                   _block_cell_id, _block_nbor_build, threads_per_atom,
-                  _time_device))
+                  _warp_size, _time_device))
     return -3;
   nbor->cell_size(cell_size);

View File

@@ -37,13 +37,16 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
                     const int gpu_nbor, const int gpu_host,
                     const bool pre_cut, const int block_cell_2d,
                     const int block_cell_id, const int block_nbor_build,
-                    const int threads_per_atom, const bool time_device) {
+                    const int threads_per_atom, const int warp_size,
+                    const bool time_device) {
   clear();
   _threads_per_atom=threads_per_atom;
   _block_cell_2d=block_cell_2d;
   _block_cell_id=block_cell_id;
+  _max_block_nbor_build=block_nbor_build;
   _block_nbor_build=block_nbor_build;
+  _warp_size=warp_size;
   _shared=shared;
   dev=&devi;
   _gpu_nbor=gpu_nbor;
@@ -418,6 +421,8 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   for (int i=0; i<_ncells; i++)
     mn=std::max(mn,host_cell_counts[i]);
   mn*=8;
+  set_nbor_block_size(mn/2);
   resize_max_neighbors<numtyp,acctyp>(mn,success);
   if (!success)
     return;
@@ -497,6 +502,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   mn=host_acc[0];
   for (int i=1; i<nt; i++)
     mn=std::max(mn,host_acc[i]);
+  set_nbor_block_size(mn);
   if (mn>_max_nbors) {
     resize_max_neighbors<numtyp,acctyp>(mn,success);

View File

@@ -67,7 +67,7 @@ class Neighbor {
             const int gpu_nbor, const int gpu_host, const bool pre_cut,
             const int block_cell_2d, const int block_cell_id,
             const int block_nbor_build, const int threads_per_atom,
-            const bool time_device);
+            const int warp_size, const bool time_device);
   /// Set the size of the cutoff+skin
   inline void cell_size(const double size) { _cell_size=size; }
@@ -237,12 +237,20 @@ class Neighbor {
   double _gpu_bytes, _c_bytes, _cell_bytes;
   void alloc(bool &success);
-  int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells;
-  int _threads_per_atom;
-  int _total_atoms;
+  int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build;
+  int _ncells, _threads_per_atom, _total_atoms;
   template <class numtyp, class acctyp>
   inline void resize_max_neighbors(const int maxn, bool &success);
+  int _warp_size;
+  inline void set_nbor_block_size(const int mn) {
+    int desired=mn/(2*_warp_size);
+    desired*=_warp_size;
+    if (desired<_warp_size) desired=_warp_size;
+    else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build;
+    _block_nbor_build=desired;
+  }
 };
 }
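
The new set_nbor_block_size() targets roughly half the maximum neighbor count, snapped down to a multiple of the warp size and clamped to [_warp_size, _max_block_nbor_build]. A standalone sketch of that rounding with sample values (hypothetical test mirroring the inline above):

  #include <cstdio>
  // Same arithmetic as Neighbor::set_nbor_block_size() above.
  int nbor_block_size(const int mn, const int warp, const int max_block) {
    int desired=mn/(2*warp);    // ~mn/2, counted in whole warps
    desired*=warp;              // snap down to a warp multiple
    if (desired<warp) desired=warp;
    else if (desired>max_block) desired=max_block;
    return desired;
  }
  int main() {
    // warp=32, max_block=128: mn=40 -> 32, mn=200 -> 96, mn=300 -> 128
    printf("%d %d %d\n", nbor_block_size(40,32,128),
           nbor_block_size(200,32,128), nbor_block_size(300,32,128));
    return 0;
  }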

View File

@@ -119,6 +119,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
   int ix = BLOCK_ID_X;
   int iy = BLOCK_ID_Y % ncelly;
   int iz = BLOCK_ID_Y / ncelly;
+  int bsx = BLOCK_SIZE_X;
   int icell = ix + iy*ncellx + iz*ncellx*ncelly;
@@ -134,9 +135,9 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
   numtyp4 diff;
   numtyp r2;
-  int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/BLOCK_SIZE_X);
+  int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/bsx);
   for (int ii = 0; ii < cap; ii++) {
-    int i = icell_begin + tid + ii*BLOCK_SIZE_X;
+    int i = icell_begin + tid + ii*bsx;
     int pid_i = nall, pid_j, stride;
     numtyp4 atom_i, atom_j;
     int cnt = 0;
@@ -173,14 +174,13 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
       int num_atom_cell = jcell_end - jcell_begin;
       // load jcell to shared memory
-      int num_iter = ucl_ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD);
+      int num_iter = ucl_ceil((numtyp)num_atom_cell/bsx);
       for (int k = 0; k < num_iter; k++) {
-        int end_idx = min(BLOCK_NBOR_BUILD,
-                          num_atom_cell-k*BLOCK_NBOR_BUILD);
+        int end_idx = min(bsx, num_atom_cell-k*bsx);
         if (tid < end_idx) {
-          pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin];
+          pid_j = cell_particle_id[tid+k*bsx+jcell_begin];
           cell_list_sh[tid] = pid_j;
           atom_j = fetch_pos(pid_j,x_); //[pid_j];
           pos_sh[tid].x = atom_j.x;
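
With the compile-time BLOCK_NBOR_BUILD replaced by the run-time BLOCK_SIZE_X, one compiled kernel serves whatever block size the host picks via set_nbor_block_size(). A hedged CUDA illustration (the demo kernel is hypothetical; in CUDA builds BLOCK_SIZE_X maps to blockDim.x):

  // The block size is read from blockDim.x at run time, so changing it
  // between launches needs no recompilation.
  __global__ void bsx_demo(int *out) {
    int bsx = blockDim.x;                    // BLOCK_SIZE_X in the macros
    out[blockIdx.x*bsx + threadIdx.x] = bsx;
  }
  // launch with the block size chosen by set_nbor_block_size(), e.g.:
  //   bsx_demo<<<dim3(ncellx, ncelly*ncellz), block_nbor_build>>>(d_out);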

View File

@@ -100,6 +100,8 @@
 #else
+#if (ARCH < 300)
 #define THREADS_PER_ATOM 4
 #define THREADS_PER_CHARGE 8
 #define BLOCK_NBOR_BUILD 128
@@ -107,6 +109,17 @@
 #define BLOCK_BIO_PAIR 128
 #define MAX_SHARED_TYPES 11
+#else
+#define THREADS_PER_ATOM 4
+#define THREADS_PER_CHARGE 8
+#define BLOCK_NBOR_BUILD 128
+#define BLOCK_PAIR 512
+#define BLOCK_BIO_PAIR 512
+#define MAX_SHARED_TYPES 11
+#endif
 #endif
 #define WARP_SIZE 32
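
These defaults now branch on the compile-time ARCH: Kepler builds (ARCH >= 300) get larger pair blocks (512), while BLOCK_NBOR_BUILD stays at 128 and serves only as the upper bound stored in _max_block_nbor_build. A hedged way to check which branch a build took (hypothetical snippet; assumes this header is included and ARCH is set on the compile line, e.g. -DARCH=300):

  #include <cstdio>
  // Prints the arch-dependent defaults selected by the preprocessor above.
  int main() {
    printf("ARCH=%d BLOCK_PAIR=%d BLOCK_BIO_PAIR=%d WARP_SIZE=%d\n",
           ARCH, BLOCK_PAIR, BLOCK_BIO_PAIR, WARP_SIZE);
    return 0;
  }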