git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7885 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2012-03-02 15:57:05 +00:00 · 2012-03-02 15:57:05 +00:00 · d30ba2d7eb
parent 17091c755e
commit d30ba2d7eb
9 changed files with 193 additions and 122 deletions
--- a/lib/gpu/geryon/VERSION.txt
+++ b/lib/gpu/geryon/VERSION.txt
@ -1,2 +1 @@
-Geryon Version 11.094
- 
+Geryon Version 12.034
--- a/lib/gpu/geryon/nvd_device.h
+++ b/lib/gpu/geryon/nvd_device.h
@ -117,6 +117,14 @@ class UCL_Device {
    _cq.pop_back();
  }
  
+  /// Set the default command queue (by default this is the null stream)
+  /** \param i index of the command queue (as added by push_command_queue()) 
+      If i is 0, the default command queue is set to the null stream **/
+  inline void set_command_queue(const int i) {
+    if (i==0) _cq[0]=0;
+    else _cq[0]=_cq[i];
+  }
+  
  /// Get the current CUDA device name
  inline std::string name() { return name(_device); }
  /// Get the CUDA device name
@ -280,6 +288,7 @@ inline int UCL_Device::set(int num) {
  if (_device>-1) {
    CU_SAFE_CALL_NS(cuCtxDestroy(_context));
    for (int i=1; i<num_queues(); i++) pop_command_queue();
+    _cq[0]=0;
  }
  _device=_properties[num].device_id;
  CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
--- a/lib/gpu/geryon/nvd_kernel.h
+++ b/lib/gpu/geryon/nvd_kernel.h
@ -34,11 +34,11 @@ class UCL_Texture;
 /// Class storing 1 or more kernel functions from a single string or file
 class UCL_Program {
 public:
-  inline UCL_Program(UCL_Device &device) {}
+  inline UCL_Program(UCL_Device &device) { _cq=device.cq(); }
  inline ~UCL_Program() {}

  /// Initialize the program with a device
-  inline void init(UCL_Device &device) { }
+  inline void init(UCL_Device &device) { _cq=device.cq(); }

  /// Clear any data associated with program
  /** \note Must call init() after each clear **/
@ -130,6 +130,7 @@ class UCL_Program {
  friend class UCL_Kernel;
 private:
  CUmodule _module;
+  CUstream _cq;
  friend class UCL_Texture;
 };

@ -141,7 +142,7 @@ class UCL_Kernel {
  
  UCL_Kernel(UCL_Program &program, const char *function) : 
    _dimensions(1), _num_args(0), _param_size(0) 
-    { _num_blocks[0]=0; set_function(program,function); }
+    { _num_blocks[0]=0; set_function(program,function); _cq=program._cq; }
  
  ~UCL_Kernel() {}

@ -160,6 +161,7 @@ class UCL_Kernel {
      #endif
      return UCL_FUNCTION_NOT_FOUND;
    }
+    _cq=program._cq;
    return UCL_SUCCESS;
  }

@ -229,7 +231,7 @@ class UCL_Kernel {
  /// Run the kernel in the default command queue
  inline void run() {
    CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
-    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],0));
+    CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],_cq));
  }
  
  /// Run the kernel in the specified command queue
@ -245,6 +247,7 @@ class UCL_Kernel {

 private:
  CUfunction _kernel;
+  CUstream _cq;
  unsigned _dimensions;
  unsigned _num_blocks[2];
  unsigned _num_args;
--- a/lib/gpu/geryon/nvd_memory.h
+++ b/lib/gpu/geryon/nvd_memory.h
@ -57,6 +57,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
  if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
    return UCL_MEMORY_ERROR;
+  mat.cq()=cm.cq();
  return UCL_SUCCESS;
 }

@ -72,6 +73,7 @@ inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
    *(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
  if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
    return UCL_MEMORY_ERROR;
+  mat.cq()=dev.cq();
  return UCL_SUCCESS;
 }

@ -92,6 +94,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
  CUresult err=cuMemAlloc(&mat.cbegin(),n);
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
+  mat.cq()=cm.cq();
  return UCL_SUCCESS;
 }

@ -101,6 +104,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
  CUresult err=cuMemAlloc(&mat.cbegin(),n);
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
+  mat.cq()=dev.cq();
  return UCL_SUCCESS;
 }

@ -115,6 +119,7 @@ inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
  pitch=static_cast<size_t>(upitch);                               
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
+  mat.cq()=cm.cq();
  return UCL_SUCCESS;
 }    

@ -129,6 +134,7 @@ inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
  pitch=static_cast<size_t>(upitch);                               
  if (err!=CUDA_SUCCESS)
    return UCL_MEMORY_ERROR;
+  mat.cq()=d.cq();
  return UCL_SUCCESS;
 }    

@ -243,8 +249,8 @@ template<> struct _ucl_memcpy<2,2> {
                            const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -255,8 +261,8 @@ template<> struct _ucl_memcpy<2,2> {
                            const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -280,8 +286,8 @@ template<> struct _ucl_memcpy<2,0> {
                            const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcDevice=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -292,8 +298,8 @@ template<> struct _ucl_memcpy<2,0> {
                            const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcDevice=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -317,8 +323,8 @@ template<> struct _ucl_memcpy<2,1> {
                            const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -329,8 +335,8 @@ template<> struct _ucl_memcpy<2,1> {
                            const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstArray=dst.cbegin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -354,8 +360,8 @@ template<> struct _ucl_memcpy<0,2> {
                            const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstDevice=dst.cbegin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -366,8 +372,8 @@ template<> struct _ucl_memcpy<0,2> {
                            const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstDevice=dst.cbegin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -391,8 +397,8 @@ template<> struct _ucl_memcpy<1,2> {
                            const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -403,8 +409,8 @@ template<> struct _ucl_memcpy<1,2> {
                            const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcArray=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -428,8 +434,8 @@ template <> struct _ucl_memcpy<1,0> {
                            const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcDevice=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -440,8 +446,8 @@ template <> struct _ucl_memcpy<1,0> {
                            const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcDevice=src.cbegin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -465,8 +471,8 @@ template <> struct _ucl_memcpy<0,1> {
                            const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstDevice=dst.cbegin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -477,8 +483,8 @@ template <> struct _ucl_memcpy<0,1> {
                            const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstDevice=dst.cbegin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -500,8 +506,8 @@ template <> struct _ucl_memcpy<1,1> {
                            const size_t rows) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -512,8 +518,8 @@ template <> struct _ucl_memcpy<1,1> {
                            const size_t rows, CUstream &cq) {
    CUDA_MEMCPY2D ins;
    _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+    ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+    ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
    ins.dstHost=dst.begin();
    ins.srcHost=src.begin();
    CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
@ -529,7 +535,7 @@ template <int mem1, int mem2> struct _ucl_memcpy {
  template <class p1, class p2>
  static inline void mc(p1 &dst, const p2 &src, const size_t n,
                        CUstream &cq) {
-    CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n));
+    CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin(),src.cbegin(),n,cq));
  }
  template <class p1, class p2>
      static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, 
@ -546,8 +552,8 @@ template <int mem1, int mem2> struct _ucl_memcpy {
    } else {                                       
      CUDA_MEMCPY2D ins;
      _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-      ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-      ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+      ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+      ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
      ins.dstDevice=dst.cbegin();
      ins.srcDevice=src.cbegin();
      CU_SAFE_CALL(cuMemcpy2D(&ins));
@ -560,16 +566,16 @@ template <int mem1, int mem2> struct _ucl_memcpy {
    if (p1::PADDED==0 || p2::PADDED==0) {
      size_t src_offset=0, dst_offset=0;
      for (size_t i=0; i<rows; i++) {                       
-        CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
-                                  src.cbegin()+src_offset,cols));
+        CU_SAFE_CALL(cuMemcpyDtoDAsync(dst.cbegin()+dst_offset,
+                                       src.cbegin()+src_offset,cols,cq));
        src_offset+=spitch;
        dst_offset+=dpitch;
      }
    } else {
      CUDA_MEMCPY2D ins;
      _nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
-      ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();                            
-      ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();                            
+      ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
+      ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
      ins.dstDevice=dst.cbegin();
      ins.srcDevice=src.cbegin();
      CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -124,16 +124,25 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica,
  if (my_gpu>=gpu->num_devices())
    return -2;
    
+  #ifndef CUDA_PRX
  if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false)
    return -7;
+  #endif
  
  if (gpu->set(my_gpu)!=UCL_SUCCESS)
    return -6;

+  gpu->push_command_queue();
+  gpu->set_command_queue(1);
+
  _long_range_precompute=0;

-  int flag=compile_kernels();
-
+  int flag=0;
+  for (int i=0; i<_procs_per_gpu; i++) {
+    if (_gpu_rank==i)
+      flag=compile_kernels();
+    gpu_barrier();
+  }
  return flag;
 }

--- a/lib/gpu/lal_eam.cpp
+++ b/lib/gpu/lal_eam.cpp
@ -348,8 +348,9 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal,
  // copy fp from device to host for comm
  _nlocal=nlocal;
  time_fp1.start();
-  ucl_copy(host_fp,dev_fp,nlocal,false);
+  ucl_copy(host_fp,dev_fp,nlocal,true);
  time_fp1.stop();
+  time_fp1.sync_stop();
 }

 // ---------------------------------------------------------------------------
@ -427,8 +428,9 @@ int** EAMT::compute(const int ago, const int inum_full, const int nall,
  // copy fp from device to host for comm
  _nlocal=inum_full;
  time_fp1.start();
-  ucl_copy(host_fp,dev_fp,inum_full,false);
+  ucl_copy(host_fp,dev_fp,inum_full,true);
  time_fp1.stop();
+  time_fp1.sync_stop();
  
  return this->nbor->host_jlist.begin()-host_start;
 }
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@ -69,10 +69,12 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
  time_kernel.init(*dev);
  time_hybrid1.init(*dev);
  time_hybrid2.init(*dev);
+  time_transpose.init(*dev);
  time_nbor.zero();
  time_kernel.zero();
  time_hybrid1.zero();
  time_hybrid2.zero();
+  time_transpose.zero();

  _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
  if (_max_atoms==0)
@ -203,6 +205,7 @@ void Neighbor::clear() {
    time_nbor.clear();
    time_hybrid1.clear();
    time_hybrid2.clear();
+    time_transpose.clear();
  }
 }

@ -285,6 +288,41 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
  }
 }

+template <class numtyp, class acctyp>
+void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
+  if (maxn>_max_nbors) {  
+    int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
+    dev_nbor.clear();
+    success=success && 
+            (dev_nbor.alloc((mn+1)*_max_atoms,*dev)==UCL_SUCCESS);
+    _gpu_bytes=dev_nbor.row_bytes();
+    if (_max_host>0) {
+      host_nbor.clear();
+      dev_host_nbor.clear();
+      success=success && (host_nbor.alloc(mn*_max_host,*dev,
+                                          UCL_RW_OPTIMIZED)==UCL_SUCCESS);
+      success=success && (dev_host_nbor.alloc(mn*_max_host,
+                                        *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
+      int *ptr=host_nbor.begin();
+      for (int i=0; i<_max_host; i++) {
+        host_jlist[i]=ptr;
+        ptr+=mn;
+      }                                                 
+      _gpu_bytes+=dev_host_nbor.row_bytes();
+    } else {
+      dev_host_nbor.view(dev_nbor);
+      dev_host_numj.view(dev_nbor);
+    }
+    if (_alloc_packed) {
+      dev_packed.clear();
+      success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
+                                           UCL_READ_ONLY)==UCL_SUCCESS);
+      _gpu_bytes+=dev_packed.row_bytes();
+    }
+    _max_nbors=mn;
+  }
+}
+
 template <class numtyp, class acctyp>
 void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
                               const int nall, Atom<numtyp,acctyp> &atom, 
@ -320,6 +358,29 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,

  const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);

+  if (_maxspecial>0) {
+    time_nbor.start();
+    UCL_H_Vec<int> view_nspecial, view_special, view_tag;
+    view_nspecial.view(nspecial[0],nt*3,*dev);
+    view_special.view(special[0],nt*_maxspecial,*dev);
+    view_tag.view(tag,nall,*dev);
+    ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
+    ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
+    ucl_copy(atom.dev_tag,view_tag,nall,false);
+    time_nbor.stop();
+    if (_time_device)
+      time_nbor.add_to_total();
+    time_transpose.start();
+    const int b2x=_block_cell_2d;
+    const int b2y=_block_cell_2d;
+    const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
+    const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
+    _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
+    _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
+                             &_maxspecial,&nt);        
+    time_transpose.stop();
+  }
+  
  // If binning on CPU, do this now
  if (_gpu_nbor==2) {
    double stime = MPI_Wtime();
@ -352,6 +413,16 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
      cell_id[i]=id;
      host_cell_counts[id+1]++;
    }
+    
+    mn=0;
+    for (int i=0; i<_ncells; i++)
+      mn=std::max(mn,host_cell_counts[i]);
+    mn*=8;
+    resize_max_neighbors<numtyp,acctyp>(mn,success);
+    if (!success)
+      return;
+    _total_atoms=nt;
+
    cell_iter[0]=0;
    for (int i=1; i<_ncells; i++) {
      host_cell_counts[i]+=host_cell_counts[i-1];
@ -372,28 +443,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
    _bin_time+=MPI_Wtime()-stime;
  }        

-  if (_maxspecial>0) {
-    time_nbor.start();
-    UCL_H_Vec<int> view_nspecial, view_special, view_tag;
-    view_nspecial.view(nspecial[0],nt*3,*dev);
-    view_special.view(special[0],nt*_maxspecial,*dev);
-    view_tag.view(tag,nall,*dev);
-    ucl_copy(dev_nspecial,view_nspecial,nt*3,false);
-    ucl_copy(dev_special_t,view_special,nt*_maxspecial,false);
-    ucl_copy(atom.dev_tag,view_tag,nall,false);
-    time_nbor.stop();
-    if (_time_device)
-      time_nbor.add_to_total();
-    time_kernel.start();
-    const int b2x=_block_cell_2d;
-    const int b2y=_block_cell_2d;
-    const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
-    const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
-    _shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
-    _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
-                             &_maxspecial,&nt);        
-  } else
-    time_kernel.start();
+  time_kernel.start();

  _nbor_pitch=inum;
  _shared->neigh_tex.bind_float(atom.dev_x,4);
@ -435,54 +485,30 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
  /* Get the maximum number of nbors and realloc if necessary */
  UCL_D_Vec<int> numj;
  numj.view_offset(inum,dev_nbor,inum);
-  ucl_copy(host_acc,numj,inum,false);
+  ucl_copy(host_acc,numj,inum,true);
  if (nt>inum) {
    UCL_H_Vec<int> host_offset;
    host_offset.view_offset(inum,host_acc,nt-inum);
-    ucl_copy(host_offset,dev_host_numj,nt-inum,false);
+    ucl_copy(host_offset,dev_host_numj,nt-inum,true);
  }
-  mn=host_acc[0];
-  for (int i=1; i<nt; i++)
-    mn=std::max(mn,host_acc[i]);
+  
+  if (_gpu_nbor!=2) {
+    host_acc.sync();
+    mn=host_acc[0];
+    for (int i=1; i<nt; i++)
+      mn=std::max(mn,host_acc[i]);

-  if (mn>_max_nbors) {  
-    mn=static_cast<int>(static_cast<double>(mn)*1.10);
-    dev_nbor.clear();
-    success=success && 
-            (dev_nbor.alloc((mn+1)*_max_atoms,atom.dev_x)==UCL_SUCCESS);
-    _gpu_bytes=dev_nbor.row_bytes();
-    if (_max_host>0) {
-      host_nbor.clear();
-      dev_host_nbor.clear();
-      success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
-                                          UCL_RW_OPTIMIZED)==UCL_SUCCESS);
-      success=success && (dev_host_nbor.alloc(mn*_max_host,
-                                        dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
-      int *ptr=host_nbor.begin();
-      for (int i=0; i<_max_host; i++) {
-        host_jlist[i]=ptr;
-        ptr+=mn;
-      }                                                 
-      _gpu_bytes+=dev_host_nbor.row_bytes();
-    } else {
-      dev_host_nbor.view(dev_nbor);
-      dev_host_numj.view(dev_nbor);
-    }
-    if (_alloc_packed) {
-      dev_packed.clear();
-      success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
-                                           UCL_READ_ONLY)==UCL_SUCCESS);
-      _gpu_bytes+=dev_packed.row_bytes();
-    }
-    if (!success)
+    if (mn>_max_nbors) {  
+      resize_max_neighbors<numtyp,acctyp>(mn,success);
+      if (!success)
+        return;
+      time_kernel.stop();
+      if (_time_device)
+        time_kernel.add_to_total();
+      build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag,
+                      nspecial, special, success, mn);
      return;
-    _max_nbors=mn;
-    time_kernel.stop();
-    if (_time_device)
-      time_kernel.add_to_total();
-    build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
-                    special, success, mn);
-    return;
+    }
  }
  
  if (_maxspecial>0) {
@ -497,8 +523,10 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
  time_kernel.stop();

  time_nbor.start();
-  if (inum<nt)
-    ucl_copy(host_nbor,dev_host_nbor,false);
+  if (inum<nt) {
+    ucl_copy(host_nbor,dev_host_nbor,true);
+    host_nbor.sync();
+  }
  time_nbor.stop();
 }

--- a/lib/gpu/lal_neighbor.h
+++ b/lib/gpu/lal_neighbor.h
@ -106,13 +106,24 @@ class Neighbor {

  inline void acc_timers() {
    if (_nbor_time_avail) {
-      time_nbor.add_to_total();
-      time_kernel.add_to_total();
      if (_gpu_nbor==2) {
-        time_hybrid1.add_to_total();
-        time_hybrid2.add_to_total();
+        int mn=0;
+        for (int i=0; i<_total_atoms; i++)
+          mn=std::max(mn,host_acc[i]);
+        if (mn>_max_nbors)
+          assert(0==1);
+      }
+      if (_time_device) {
+        time_nbor.add_to_total();
+        time_kernel.add_to_total();
+        if (_gpu_nbor==2) {
+          time_hybrid1.add_to_total();
+          time_hybrid2.add_to_total();
+        }
+        if (_maxspecial>0)
+          time_transpose.add_to_total();
+        _nbor_time_avail=false;
      }
-      _nbor_time_avail=false;
    }
  }

@ -213,7 +224,7 @@ class Neighbor {
  UCL_D_Vec<int> dev_cell_counts;

  /// Device timers
-  UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2;
+  UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose;
  
 private:
  NeighborShared *_shared;
@ -228,6 +239,10 @@ class Neighbor {
  
  int _block_cell_2d, _block_cell_id, _block_nbor_build, _ncells;
  int _threads_per_atom;
+  int _total_atoms;
+
+  template <class numtyp, class acctyp>
+  inline void resize_max_neighbors(const int maxn, bool &success);
 };

 }
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@ -85,7 +85,7 @@ __kernel void kernel_calc_cell_counts(unsigned *cell_id,
 __kernel void transpose(__global int *out, __global int *in, int columns_in, 
                        int rows_in)
 {
-	__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
+	__local int block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
 	
 	unsigned ti=THREAD_ID_X;
 	unsigned tj=THREAD_ID_Y;