git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7283 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2011-12-02 16:11:02 +00:00
parent 2e2d4c0b81
commit 2728aab0e5
4 changed files with 32 additions and 25 deletions

View File

@ -146,6 +146,10 @@ void Neighbor::alloc(bool &success) {
ptr+=_max_nbors;
}
_c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
} else {
// Some OpenCL implementations return errors for NULL pointers as args
dev_host_nbor.view(dev_nbor);
dev_host_numj.view(dev_nbor);
}
if (_maxspecial>0) {
dev_nspecial.clear();
@ -460,7 +464,8 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
ptr+=mn;
}
_gpu_bytes+=dev_host_nbor.row_bytes();
}
} else
dev_host_nbor.view(dev_nbor);
if (_alloc_packed) {
dev_packed.clear();
success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,

View File

@ -200,7 +200,7 @@ __kernel void calc_neigh_list_cell(__global numtyp4 *x_,
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
if (r2 < cell_size*cell_size && r2 > 1e-5) {
cnt++;
if (cnt < neigh_bin_size) {
if (cnt <= neigh_bin_size) {
*neigh_list = pid_j;
neigh_list++;
if ((cnt & (t_per_atom-1))==0)

View File

@ -50,7 +50,8 @@ void NeighborShared::compile_kernels(UCL_Device &dev, const int gpu_nbor) {
return;
_gpu_nbor=gpu_nbor;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable -D"+
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE)+" -D"+
std::string(OCL_VENDOR);
if (_gpu_nbor==0) {

View File

@ -67,6 +67,22 @@
#ifdef NV_KERNEL
// Index/size helpers and qualifier aliases defined in the NV_KERNEL branch.
// They are written in terms of the CUDA built-ins threadIdx, blockIdx,
// blockDim and gridDim.

// Global index of the calling thread along x / y:
// thread index within the block plus block index times block size.
// NOTE(review): the expansion is an unparenthesised sum, so using it inside a
// larger expression (e.g. 2*GLOBAL_ID_X) binds only to the first term.
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
// Total number of threads along x / y: grid size times block size.
// NOTE(review): the expansion ends with ';', so these only work as the last
// token of a statement; anything written after the macro in the same
// expression becomes a separate statement. Confirm call sites before changing.
#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
// Thread index within its block.
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
// Block index within the grid.
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
// Number of threads per block.
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
// Qualifier aliases: __kernel and __local are replaced by CUDA qualifiers,
// and __global expands to nothing. Presumably this lets kernel sources
// written with OpenCL-style qualifiers compile in this branch -- confirm
// against the non-NV_KERNEL branch of this header.
#define __kernel extern "C" __global__
#define __local __shared__
#define __global
// atom_add is spelled atomicAdd in this branch.
#define atom_add atomicAdd
// Declaration prefix for inline device helper functions.
#define ucl_inline static __inline__ __device__
#ifdef __CUDA_ARCH__
#define ARCH __CUDA_ARCH__
#else
@ -120,24 +136,7 @@ struct __builtin_align__(16) _double4
typedef struct _double4 double4;
#endif
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#define __local __shared__
#define __global
#define atom_add atomicAdd
#define ucl_inline static __inline__ __device__
#ifndef _DOUBLE_DOUBLE
#ifdef _DOUBLE_DOUBLE
#define ucl_exp exp
#define ucl_powr pow
@ -156,20 +155,18 @@ typedef struct _double4 double4;
#define ucl_ceil ceilf
#define ucl_abs fabsf
#define ucl_recip(x) ((numtyp)1.0/(x))
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#ifdef NO_HARDWARE_TRANSCENDENTALS
#define ucl_exp expf
#define ucl_powr powf
#define ucl_rsqrt rsqrtf
#define ucl_sqrt sqrtf
#else
#define ucl_exp __expf
#define ucl_powr __powf
#define ucl_rsqrt __rsqrtf
#define ucl_sqrt __sqrtf
#endif
@ -255,6 +252,10 @@ typedef struct _double4 double4;
#define ucl_ceil ceil
#define ucl_abs fabs
#ifdef _DOUBLE_DOUBLE
#define NO_HARDWARE_TRANSCENDENTALS
#endif
#ifdef NO_HARDWARE_TRANSCENDENTALS
#define ucl_exp exp