git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7989 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2012-03-28 22:10:48 +00:00
parent f5d54c5ac1
commit f3905bf229
7 changed files with 53 additions and 15 deletions

View File

@@ -77,6 +77,14 @@ class UCL_Device {
   /// Returns the stream indexed by i
   inline command_queue & cq(const int i) { return _cq[i]; }
+  /// Set the default command queue (by default this is the null stream)
+  /** \param i index of the command queue (as added by push_command_queue())
+      If i is 0, the default command queue is set to the null stream **/
+  inline void set_command_queue(const int i) {
+    if (i==0) _cq[0]=0;
+    else _cq[0]=_cq[i];
+  }
   /// Block until all commands in the default stream have completed
   inline void sync() { sync(0); }
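
For context, a minimal usage sketch of the new accessor (hypothetical caller code; it assumes only the UCL_Device members referenced in this header: push_command_queue(), set_command_queue(), and sync()):

  UCL_Device dev;             // opens the default device; _cq[0] is the null stream
  dev.push_command_queue();   // create stream 1 alongside the null stream
  dev.set_command_queue(1);   // the default queue now aliases stream 1
  // ... enqueue kernels/copies on the default queue ...
  dev.sync();                 // block until the aliased stream drains
  dev.set_command_queue(0);   // restore the null stream as the default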
@@ -127,7 +135,8 @@ class UCL_Device {
   /// Get the number of cores
   inline unsigned cores(const int i)
     { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
-      else return _properties[i].multiProcessorCount*32; }
+      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32;
+      else return _properties[i].multiProcessorCount*192; }
   /// Get the gigabytes of global memory in the current device
   inline double gigabytes() { return gigabytes(_device); }
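
The new branch tracks cores per multiprocessor across CUDA generations: 8 before Fermi (arch < 2.0), 32 on Fermi (2.x), and 192 on Kepler (3.x). A standalone restatement of the mapping (hypothetical helper, not part of Geryon):

  // Cores per multiprocessor by compute capability, as in cores() above.
  inline unsigned cores_per_mp(const double arch) {
    if (arch < 2.0) return 8;        // pre-Fermi
    else if (arch < 3.0) return 32;  // Fermi
    else return 192;                 // Kepler
  }
  // e.g. a Kepler device with 8 multiprocessors reports 8*192 = 1536 cores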
@@ -205,6 +214,7 @@ inline int UCL_Device::set(int num) {
   if (_device==num)
     return UCL_SUCCESS;
   for (int i=1; i<num_queues(); i++) pop_command_queue();
+  _cq[0]=0;
   cudaThreadExit();
   cudaError err=cudaSetDevice(_device_ids[num]);
   if (err!=cudaSuccess) {

View File

@@ -151,7 +151,8 @@ class UCL_Device {
   /// Get the number of cores
   inline unsigned cores(const int i)
     { if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
-      else return _properties[i].multiProcessorCount*32; }
+      else if (arch(i)<3.0) return _properties[i].multiProcessorCount*32;
+      else return _properties[i].multiProcessorCount*192; }
   /// Get the gigabytes of global memory in the current device
   inline double gigabytes() { return gigabytes(_device); }

View File

@@ -124,7 +124,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
   if (my_gpu>=gpu->num_devices())
     return -2;
-  #ifndef CUDA_PRX
+  #ifndef CUDA_PROXY
   if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false)
     return -7;
   #endif
@@ -203,7 +203,7 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
   if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
                   *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
                   _block_cell_id, _block_nbor_build, threads_per_atom,
-                  _time_device))
+                  _warp_size, _time_device))
     return -3;
   nbor->cell_size(cell_size);

View File

@@ -37,13 +37,16 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
                     const int gpu_nbor, const int gpu_host,
                     const bool pre_cut, const int block_cell_2d,
                     const int block_cell_id, const int block_nbor_build,
-                    const int threads_per_atom, const bool time_device) {
+                    const int threads_per_atom, const int warp_size,
+                    const bool time_device) {
   clear();
   _threads_per_atom=threads_per_atom;
   _block_cell_2d=block_cell_2d;
   _block_cell_id=block_cell_id;
+  _max_block_nbor_build=block_nbor_build;
   _block_nbor_build=block_nbor_build;
+  _warp_size=warp_size;
   _shared=shared;
   dev=&devi;
   _gpu_nbor=gpu_nbor;
@@ -418,6 +421,8 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   for (int i=0; i<_ncells; i++)
     mn=std::max(mn,host_cell_counts[i]);
   mn*=8;
+  set_nbor_block_size(mn/2);
   resize_max_neighbors<numtyp,acctyp>(mn,success);
   if (!success)
     return;
@@ -497,6 +502,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   mn=host_acc[0];
   for (int i=1; i<nt; i++)
     mn=std::max(mn,host_acc[i]);
+  set_nbor_block_size(mn);
   if (mn>_max_nbors) {
     resize_max_neighbors<numtyp,acctyp>(mn,success);

View File

@@ -67,7 +67,7 @@ class Neighbor {
             const int gpu_nbor, const int gpu_host, const bool pre_cut,
             const int block_cell_2d, const int block_cell_id,
             const int block_nbor_build, const int threads_per_atom,
-            const bool time_device);
+            const int warp_size, const bool time_device);
   /// Set the size of the cutoff+skin
   inline void cell_size(const double size) { _cell_size=size; }
@@ -237,12 +237,20 @@ class Neighbor {
   double _gpu_bytes, _c_bytes, _cell_bytes;
   void alloc(bool &success);
-  int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells;
-  int _threads_per_atom;
-  int _total_atoms;
+  int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build;
+  int _ncells, _threads_per_atom, _total_atoms;
   template <class numtyp, class acctyp>
   inline void resize_max_neighbors(const int maxn, bool &success);
+  int _warp_size;
+  inline void set_nbor_block_size(const int mn) {
+    int desired=mn/(2*_warp_size);
+    desired*=_warp_size;
+    if (desired<_warp_size) desired=_warp_size;
+    else if (desired>_max_block_nbor_build) desired=_max_block_nbor_build;
+    _block_nbor_build=desired;
+  }
 };
 }
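
The new set_nbor_block_size() targets roughly half the maximum neighbor count, snapped down to a multiple of the warp size and clamped to [_warp_size, _max_block_nbor_build]. A standalone sketch of that rounding with sample values (hypothetical test mirroring the inline above):

  #include <cstdio>
  // Same arithmetic as Neighbor::set_nbor_block_size() above.
  int nbor_block_size(const int mn, const int warp, const int max_block) {
    int desired=mn/(2*warp);    // ~mn/2, counted in whole warps
    desired*=warp;              // snap down to a warp multiple
    if (desired<warp) desired=warp;
    else if (desired>max_block) desired=max_block;
    return desired;
  }
  int main() {
    // warp=32, max_block=128: mn=40 -> 32, mn=200 -> 96, mn=300 -> 128
    printf("%d %d %d\n", nbor_block_size(40,32,128),
           nbor_block_size(200,32,128), nbor_block_size(300,32,128));
    return 0;
  }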

View File

@@ -119,6 +119,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
   int ix = BLOCK_ID_X;
   int iy = BLOCK_ID_Y % ncelly;
   int iz = BLOCK_ID_Y / ncelly;
+  int bsx = BLOCK_SIZE_X;
   int icell = ix + iy*ncellx + iz*ncellx*ncelly;
@@ -134,9 +135,9 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
   numtyp4 diff;
   numtyp r2;
-  int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/BLOCK_SIZE_X);
+  int cap=ucl_ceil((numtyp)(icell_end - icell_begin)/bsx);
   for (int ii = 0; ii < cap; ii++) {
-    int i = icell_begin + tid + ii*BLOCK_SIZE_X;
+    int i = icell_begin + tid + ii*bsx;
     int pid_i = nall, pid_j, stride;
     numtyp4 atom_i, atom_j;
     int cnt = 0;
@@ -173,14 +174,13 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
       int num_atom_cell = jcell_end - jcell_begin;
       // load jcell to shared memory
-      int num_iter = ucl_ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD);
+      int num_iter = ucl_ceil((numtyp)num_atom_cell/bsx);
       for (int k = 0; k < num_iter; k++) {
-        int end_idx = min(BLOCK_NBOR_BUILD,
-                          num_atom_cell-k*BLOCK_NBOR_BUILD);
+        int end_idx = min(bsx, num_atom_cell-k*bsx);
         if (tid < end_idx) {
-          pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin];
+          pid_j = cell_particle_id[tid+k*bsx+jcell_begin];
           cell_list_sh[tid] = pid_j;
           atom_j = fetch_pos(pid_j,x_); //[pid_j];
           pos_sh[tid].x = atom_j.x;
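
With the compile-time BLOCK_NBOR_BUILD replaced by the run-time BLOCK_SIZE_X, one compiled kernel serves whatever block size the host picks via set_nbor_block_size(). A hedged CUDA illustration (the demo kernel is hypothetical; in CUDA builds BLOCK_SIZE_X maps to blockDim.x):

  // The block size is read from blockDim.x at run time, so changing it
  // between launches needs no recompilation.
  __global__ void bsx_demo(int *out) {
    int bsx = blockDim.x;                    // BLOCK_SIZE_X in the macros
    out[blockIdx.x*bsx + threadIdx.x] = bsx;
  }
  // launch with the block size chosen by set_nbor_block_size(), e.g.:
  //   bsx_demo<<<dim3(ncellx, ncelly*ncellz), block_nbor_build>>>(d_out);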

View File

@@ -100,6 +100,8 @@
 #else
+#if (ARCH < 300)
 #define THREADS_PER_ATOM 4
 #define THREADS_PER_CHARGE 8
 #define BLOCK_NBOR_BUILD 128
@@ -107,6 +109,17 @@
 #define BLOCK_BIO_PAIR 128
 #define MAX_SHARED_TYPES 11
+#else
+#define THREADS_PER_ATOM 4
+#define THREADS_PER_CHARGE 8
+#define BLOCK_NBOR_BUILD 128
+#define BLOCK_PAIR 512
+#define BLOCK_BIO_PAIR 512
+#define MAX_SHARED_TYPES 11
+#endif
 #endif
 #define WARP_SIZE 32
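
These defaults now branch on the compile-time ARCH: Kepler builds (ARCH >= 300) get larger pair blocks (512), while BLOCK_NBOR_BUILD stays at 128 and serves only as the upper bound stored in _max_block_nbor_build. A hedged way to check which branch a build took (hypothetical snippet; assumes this header is included and ARCH is set on the compile line, e.g. -DARCH=300):

  #include <cstdio>
  // Prints the arch-dependent defaults selected by the preprocessor above.
  int main() {
    printf("ARCH=%d BLOCK_PAIR=%d BLOCK_BIO_PAIR=%d WARP_SIZE=%d\n",
           ARCH, BLOCK_PAIR, BLOCK_BIO_PAIR, WARP_SIZE);
    return 0;
  }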