git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7283 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-12-02 16:11:02 +00:00 · 2011-12-02 16:11:02 +00:00 · 2728aab0e5
parent 2e2d4c0b81
commit 2728aab0e5
4 changed files with 32 additions and 25 deletions
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@@ -146,6 +146,10 @@ void Neighbor::alloc(bool &success) {
      ptr+=_max_nbors;
    }                                                 
    _c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
+  } else {
+    // Some OpenCL implementations return errors for NULL pointers as args
+    dev_host_nbor.view(dev_nbor);
+    dev_host_numj.view(dev_nbor);
  }
  if (_maxspecial>0) {
    dev_nspecial.clear();
@@ -460,7 +464,8 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
        ptr+=mn;
      }                                                 
      _gpu_bytes+=dev_host_nbor.row_bytes();
-    }
+    } else
+      dev_host_nbor.view(dev_nbor);
    if (_alloc_packed) {
      dev_packed.clear();
      success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@@ -200,7 +200,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
                r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
                if (r2 < cell_size*cell_size && r2 > 1e-5) {
                  cnt++;
-                  if (cnt < neigh_bin_size) {
+                  if (cnt <= neigh_bin_size) {
                    *neigh_list = pid_j;
                    neigh_list++;
                    if ((cnt & (t_per_atom-1))==0)
--- a/lib/gpu/lal_neighbor_shared.cpp
+++ b/lib/gpu/lal_neighbor_shared.cpp
@@ -50,7 +50,8 @@ void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) {
  	return;
  	
  _gpu_nbor=gpu_nbor;
-  std::string flags="-cl-fast-relaxed-math -cl-mad-enable -D"+
+  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
+                    std::string(OCL_PRECISION_COMPILE)+" -D"+
                    std::string(OCL_VENDOR);

  if (_gpu_nbor==0) {
--- a/lib/gpu/lal_preprocessor.h
+++ b/lib/gpu/lal_preprocessor.h
@@ -67,6 +67,22 @@

 #ifdef NV_KERNEL

+#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
+#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
+#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
+#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
+#define THREAD_ID_X threadIdx.x
+#define THREAD_ID_Y threadIdx.y
+#define BLOCK_ID_X blockIdx.x
+#define BLOCK_ID_Y blockIdx.y
+#define BLOCK_SIZE_X blockDim.x
+#define BLOCK_SIZE_Y blockDim.y
+#define __kernel extern "C" __global__
+#define __local __shared__
+#define __global  
+#define atom_add atomicAdd
+#define ucl_inline static __inline__ __device__ 
+
 #ifdef __CUDA_ARCH__
 #define ARCH __CUDA_ARCH__
 #else
@@ -120,24 +136,7 @@ struct __builtin_align__(16) _double4
 typedef struct _double4 double4;
 #endif

-#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
-#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
-#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
-#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
-#define THREAD_ID_X threadIdx.x
-#define THREAD_ID_Y threadIdx.y
-#define BLOCK_ID_X blockIdx.x
-#define BLOCK_ID_Y blockIdx.y
-#define BLOCK_SIZE_X blockDim.x
-#define BLOCK_SIZE_Y blockDim.y
-#define __kernel extern "C" __global__
-#define __local __shared__
-#define __global  
-#define atom_add atomicAdd
-#define ucl_inline static __inline__ __device__ 
-
-
-#ifndef _DOUBLE_DOUBLE
+#ifdef _DOUBLE_DOUBLE

 #define ucl_exp exp
 #define ucl_powr pow
@@ -156,20 +155,18 @@ typedef struct _double4 double4;
 #define ucl_ceil ceilf
 #define ucl_abs fabsf
 #define ucl_recip(x) ((numtyp)1.0/(x))
+#define ucl_rsqrt rsqrtf
+#define ucl_sqrt sqrtf

 #ifdef NO_HARDWARE_TRANSCENDENTALS

 #define ucl_exp expf
 #define ucl_powr powf
-#define ucl_rsqrt rsqrtf
-#define ucl_sqrt sqrtf

 #else

 #define ucl_exp __expf
 #define ucl_powr __powf
-#define ucl_rsqrt __rsqrtf
-#define ucl_sqrt __sqrtf

 #endif

@@ -255,6 +252,10 @@ typedef struct _double4 double4;
 #define ucl_ceil ceil
 #define ucl_abs fabs

+#ifdef _DOUBLE_DOUBLE
+#define NO_HARDWARE_TRANSCENDENTALS
+#endif
+
 #ifdef NO_HARDWARE_TRANSCENDENTALS

 #define ucl_exp exp