Improvements in lib/gpu: non-cubic cell grids (ncellx/ncelly/ncellz), cell-list kernels templated on block size (64/128/256), CUDA launch error checking, and removal of the O(N^2) LJ kernel

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@3804 f3b2605a-c512-4ea7-a41b-209d697bcdaa
pscrozi 2010-02-11 20:46:34 +00:00
parent 8eaf196813
commit 71ed5c03ec
6 changed files with 180 additions and 363 deletions

View File: lib/gpu/lj_gpu.cu

@@ -27,29 +27,12 @@
#include "lj_gpu_memory.cu"
#include "lj_gpu_kernel.h"
#ifdef WINDLL
#include <windows.h>
BOOL APIENTRY DllMain(HANDLE hModule, DWORD dwReason, LPVOID lpReserved)
{
return TRUE;
}
#endif
#ifdef WINDLL
#define EXTERN extern "C" __declspec(dllexport)
#else
#define EXTERN
#endif
using namespace std;
static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
#define LJMT LJ_GPU_Memory<numtyp,acctyp>
static float kernelTime = 0.0;
static int ncell1D;
static float *energy, *d_energy;
static float3 *d_force, *f_temp, *v_temp, *d_virial;
static cell_list cell_list_gpu;
// ---------------------------------------------------------------------------
// Convert something to a string
@@ -93,8 +76,10 @@ EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **
bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
ncell1D = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size);
init_cell_list_const(cell_size, skin, boxlo, boxhi);
return ret;
@@ -153,6 +138,8 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
const int nall, const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
cudaError_t err;
ljm.atom.nall(nall);
ljm.atom.inum(inum);
@@ -161,8 +148,8 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
double evdwl=0.0;
static int buffer = CELL_SIZE;
static int ncell = (int)pow((float)ncell1D,3);
static int blockSize = BLOCK_1D;
static int ncell = ncellx*ncelly*ncellz;
static int first_call = 1;
@@ -176,7 +163,7 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
cudaMalloc((void**)&d_energy, inum*sizeof(float));
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
init_cell_list(cell_list_gpu, nall, ncell, buffer);
init_cell_list(cell_list_gpu, nall, ncell, blockSize);
first_call = 0;
}
@@ -198,13 +185,13 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
clear_cell_list(cell_list_gpu);
init_cell_list(cell_list_gpu, nall, ncell, buffer);
init_cell_list(cell_list_gpu, nall, ncell, blockSize);
}
// build cell-list on GPU
ljm.atom.time_atom.start();
build_cell_list(host_x[0], host_type, cell_list_gpu,
ncell, ncell1D, buffer, inum, nall, ago);
ncell, ncellx, ncelly, ncellz, blockSize, inum, nall, ago);
ljm.atom.time_atom.stop();
ljm.time_pair.start();
@@ -216,25 +203,32 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
cudaEventRecord(start, 0);
#endif
#define KERNEL_LJ_CELL(e, v, b, s) kernel_lj_cell<e,v,b><<<GX, BX, s>>> \
(d_force, d_energy, d_virial, \
cell_list_gpu.pos, \
cell_list_gpu.idx, \
cell_list_gpu.type, \
cell_list_gpu.natom, \
inum, nall, ncell, ncellx, ncelly, ncellz);
// call the cell-list force kernel
const int BX=BLOCK_1D;
dim3 GX(ncell1D, ncell1D*ncell1D);
const int BX=blockSize;
dim3 GX(ncellx, ncelly*ncellz);
if (eflag == 0 && vflag == 0) {
kernel_lj_cell<false,false><<<GX, BX, 0>>>
(d_force, d_energy, d_virial,
cell_list_gpu.pos,
cell_list_gpu.idx,
cell_list_gpu.type,
cell_list_gpu.natom,
inum, nall, ncell);
if (blockSize == 64 ) KERNEL_LJ_CELL(false, false, 64, 0);
if (blockSize == 128) KERNEL_LJ_CELL(false, false, 128, 0);
if (blockSize == 256) KERNEL_LJ_CELL(false, false, 256, 0);
} else {
kernel_lj_cell<true,true><<<GX, BX, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES>>>
(d_force, d_energy, d_virial,
cell_list_gpu.pos,
cell_list_gpu.idx,
cell_list_gpu.type,
cell_list_gpu.natom,
inum, nall, ncell);
if (blockSize == 64) KERNEL_LJ_CELL(true, true, 64, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
if (blockSize == 128) KERNEL_LJ_CELL(true, true, 128, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
if (blockSize == 256) KERNEL_LJ_CELL(true, true, 256, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
}
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("LJ force kernel launch error: %d\n", err);
exit(1);
}
#ifdef TIMING
@@ -296,122 +290,6 @@ EXTERN double lj_gpu_cell(double **force, double *virial, double **host_x, int *
ago, eflag, vflag, boxlo, boxhi);
}
template <class numtyp, class acctyp>
double _lj_gpu_n2(LJMT &ljm, double **force, double *virial,
double **host_x, int *host_type, const int inum, const int nall, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
ljm.atom.nall(nall);
ljm.atom.inum(inum);
ljm.nbor.time_nbor.start();
ljm.nbor.time_nbor.stop();
double evdwl=0.0;
#ifdef NOUSE
static int first_call = 1;
if (first_call) {
energy = (float*) malloc(inum*sizeof(float));
v_temp = (float3*) malloc(inum*2*sizeof(float3));
cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
cudaMallocHost((void**)&pos_temp, nall*sizeof(float3));
cudaMalloc((void**)&d_force, inum*sizeof(float3));
cudaMalloc((void**)&d_energy, inum*sizeof(float));
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
cudaMalloc((void**)&d_pos, nall*sizeof(float3));
cudaMalloc((void**)&d_type, nall*sizeof(int));
first_call = 0;
}
ljm.atom.time_atom.start();
double *atom_pos = host_x[0];
for (int i = 0; i < 3*nall; i+=3) {
pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]);
}
cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice);
cudaMemcpy(d_type, host_type, nall*sizeof(int), cudaMemcpyHostToDevice);
ljm.atom.time_atom.stop();
ljm.time_pair.start();
// Compute the block size and grid size to keep all cores busy
const int BX=BLOCK_1D;
dim3 GX(static_cast<int>(ceil(static_cast<double>(inum)/BX)));
#ifdef TIMING
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
#endif
// N^2 force kernel
kernel_lj_n2<numtyp, acctyp><<<GX, BX>>>(d_force, d_energy, d_virial,
d_pos, d_type, eflag, vflag, inum, nall);
#ifdef TIMING
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float kTime;
cudaEventElapsedTime(&kTime, start, stop);
kernelTime += kTime;
printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag);
cudaEventDestroy(start);
cudaEventDestroy(stop);
#endif
// copy results from GPU to CPU
cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost);
if (eflag) {
cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost);
for (int i = 0; i < inum; i++) {
evdwl += energy[i];
}
evdwl *= 0.5f;
}
if (vflag) {
cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost);
for (int i = 0; i < inum; i++) {
virial[0] += v_temp[2*i].x;
virial[1] += v_temp[2*i].y;
virial[2] += v_temp[2*i].z;
virial[3] += v_temp[2*i+1].x;
virial[4] += v_temp[2*i+1].y;
virial[5] += v_temp[2*i+1].z;
}
for (int i = 0; i < 6; i++)
virial[i] *= 0.5f;
}
for (int i = 0; i < inum; i++) {
force[i][0] += f_temp[i].x;
force[i][1] += f_temp[i].y;
force[i][2] += f_temp[i].z;
}
#endif
ljm.time_pair.stop();
ljm.atom.time_atom.add_to_total();
ljm.nbor.time_nbor.add_to_total();
ljm.time_pair.add_to_total();
return evdwl;
}
EXTERN double lj_gpu_n2(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
return _lj_gpu_n2<PRECISION,ACC_PRECISION>(LJMF, force, virial, host_x, host_type, inum, nall,
eflag, vflag, boxlo, boxhi);
}
EXTERN void lj_gpu_time() {
cout.precision(4);
cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n";

View File: lib/gpu/lj_gpu_kernel.h

@@ -21,25 +21,29 @@
#define LJ_GPU_KERNEL
/* Cell list version of LJ kernel */
template<bool eflag, bool vflag>
template<bool eflag, bool vflag, int blockSize>
__global__ void kernel_lj_cell(float3 *force3,
float *energy, float3 *virial,
float3 *cell_list, unsigned int *cell_idx,
int *cell_type, int *cell_atom,
const int inum, const int nall, const int ncell)
const int inum, const int nall, const int ncell,
const int ncellx, const int ncelly, const int ncellz)
{
// calculate 3D block idx from 2D block
int bx = blockIdx.x;
int by = blockIdx.y % gridDim.x;
int bz = blockIdx.y / gridDim.x;
int by = blockIdx.y % ncelly;
int bz = blockIdx.y / ncelly;
int tid = threadIdx.x;
// compute cell idx from 3D block idx
int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
__shared__ int typeSh[CELL_SIZE];
__shared__ float posSh[CELL_SIZE*3];
int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
__shared__ int typeSh[blockSize];
__shared__ float posSh[blockSize*3];
__shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@@ -51,7 +55,7 @@ __global__ void kernel_lj_cell(float3 *force3,
__shared__ float *offsetSh;
// load force parameters into shared memory
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
int itype = i/MAX_SHARED_TYPES;
int jtype = i%MAX_SHARED_TYPES;
cutsqSh[i] = _cutsq_<float>(itype,jtype);
@@ -65,7 +69,7 @@ __global__ void kernel_lj_cell(float3 *force3,
lj3Sh = smem;
lj4Sh = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
int itype = i/MAX_SHARED_TYPES;
int jtype = i%MAX_SHARED_TYPES;
lj3Sh[i] = _lj3_<float>(itype,jtype).x+0.01;
@@ -76,41 +80,41 @@ __global__ void kernel_lj_cell(float3 *force3,
__syncthreads();
int nborz0 = max(bz-1,0), nborz1 = min(bz+1, gridDim.x-1),
nbory0 = max(by-1,0), nbory1 = min(by+1, gridDim.x-1),
nborx0 = max(bx-1,0), nborx1 = min(bx+1, gridDim.x-1);
int nborz0 = max(bz-1,0), nborz1 = min(bz+1, ncellz-1),
nbory0 = max(by-1,0), nbory1 = min(by+1, ncelly-1),
nborx0 = max(bx-1,0), nborx1 = min(bx+1, ncellx-1);
for (int ii = 0; ii < ceil((float)(cell_atom[cid])/BLOCK_1D); ii++) {
for (int ii = 0; ii < ceil((float)(cell_atom[cid])/blockSize); ii++) {
float3 f = {0.0f, 0.0f, 0.0f};
float ener = 0.0f;
float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
int itype;
float ix, iy, iz;
int i = tid + ii*BLOCK_1D;
unsigned int answer_pos = cell_idx[cid*CELL_SIZE+i];
int i = tid + ii*blockSize;
unsigned int answer_pos = cell_idx[cid*blockSize+i];
// load current cell atom position and type into sMem
for (int j = tid; j < cell_atom[cid]; j += BLOCK_1D) {
int pid = cid*CELL_SIZE + j;
for (int j = tid; j < cell_atom[cid]; j += blockSize) {
int pid = cid*blockSize + j;
float3 pos = cell_list[pid];
posSh[j ] = pos.x;
posSh[j+ CELL_SIZE] = pos.y;
posSh[j+2*CELL_SIZE] = pos.z;
posSh[j+ blockSize] = pos.y;
posSh[j+2*blockSize] = pos.z;
typeSh[j] = cell_type[pid];
}
__syncthreads();
if (answer_pos < inum) {
itype = typeSh[i];
ix = posSh[i ];
iy = posSh[i+ CELL_SIZE];
iz = posSh[i+2*CELL_SIZE];
iy = posSh[i+ blockSize];
iz = posSh[i+2*blockSize];
// compute force from current cell
for (int j = 0; j < cell_atom[cid]; j++) {
if (j == i) continue;
float delx = ix - posSh[j ];
float dely = iy - posSh[j+ CELL_SIZE];
float delz = iz - posSh[j+2*CELL_SIZE];
float dely = iy - posSh[j+ blockSize];
float delz = iz - posSh[j+2*blockSize];
int jtype = typeSh[j];
int mtype = itype + jtype*MAX_SHARED_TYPES;
float r2inv = delx*delx + dely*dely + delz*delz;
@@ -149,16 +153,16 @@ __global__ void kernel_lj_cell(float3 *force3,
if (nborz == bz && nbory == by && nborx == bx) continue;
// compute cell id
int cid_nbor = nborx + INT_MUL(nbory,gridDim.x) +
INT_MUL(nborz,gridDim.x*gridDim.x);
int cid_nbor = nborx + INT_MUL(nbory,ncellx) +
INT_MUL(nborz,INT_MUL(ncellx,ncelly));
// load neighbor cell position and type into smem
for (int j = tid; j < cell_atom[cid_nbor]; j += BLOCK_1D) {
int pid = INT_MUL(cid_nbor,CELL_SIZE) + j;
for (int j = tid; j < cell_atom[cid_nbor]; j += blockSize) {
int pid = INT_MUL(cid_nbor,blockSize) + j;
float3 pos = cell_list[pid];
posSh[j ] = pos.x;
posSh[j+ CELL_SIZE] = pos.y;
posSh[j+2*CELL_SIZE] = pos.z;
posSh[j+ blockSize] = pos.y;
posSh[j+2*blockSize] = pos.z;
typeSh[j] = cell_type[pid];
}
__syncthreads();
@@ -166,8 +170,8 @@ __global__ void kernel_lj_cell(float3 *force3,
if (answer_pos < inum) {
for (int j = 0; j < cell_atom[cid_nbor]; j++) {
float delx = ix - posSh[j ];
float dely = iy - posSh[j+ CELL_SIZE];
float delz = iz - posSh[j+2*CELL_SIZE];
float dely = iy - posSh[j+ blockSize];
float delz = iz - posSh[j+2*blockSize];
int jtype = typeSh[j];
int mtype = itype + jtype*MAX_SHARED_TYPES;
float r2inv = delx*delx + dely*dely + delz*delz;
@@ -438,116 +442,4 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
} // if ii
}
/* Brute force O(N^2) version of LJ kernel */
template<class numtyp, class acctyp>
__global__ void kernel_lj_n2(float3 *force3,
float *energy, float3 *virial,
float3 *pos, int *type,
const bool eflag, const bool vflag, const int inum, const int nall)
{
int gid = threadIdx.x + INT_MUL(blockIdx.x, blockDim.x);
int tid = threadIdx.x;
__shared__ float posSh[BLOCK_1D*3];
__shared__ int typeSh[BLOCK_1D];
__shared__ numtyp cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj3Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj4Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp offsetSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
int itype=tid/MAX_SHARED_TYPES;
int jtype=tid%MAX_SHARED_TYPES;
cutsqSh[tid]=_cutsq_<numtyp>(itype,jtype);
lj1Sh[tid]=_lj1_<numtyp>(itype,jtype).x;
lj2Sh[tid]=_lj1_<numtyp>(itype,jtype).y;
lj3Sh[tid]=_lj3_<numtyp>(itype,jtype).x;
lj4Sh[tid]=_lj3_<numtyp>(itype,jtype).y;
offsetSh[tid]=_offset_<numtyp>(itype,jtype);
}
__syncthreads();
float3 f = {0.0f, 0.0f, 0.0f};
float ener = 0.0f;
float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
int itype, jtype;
int mtype;
numtyp ix, iy, iz;
if (gid < inum) {
ix = pos[gid].x;
iy = pos[gid].y;
iz = pos[gid].z;
itype = type[gid];
}
int pid = tid;
int nIter = ceil((float)nall/BLOCK_1D);
for (int jj = 0; jj < nIter; jj++, pid += BLOCK_1D) {
if (pid < nall) {
posSh[tid ] = pos[pid].x;
posSh[tid+ BLOCK_1D] = pos[pid].y;
posSh[tid+2*BLOCK_1D] = pos[pid].z;
typeSh[tid] = type[pid];
}
__syncthreads();
if (gid < inum) {
int pid_j = jj*BLOCK_1D;
for (int j = 0; j < BLOCK_1D; j++, pid_j++) {
if (jj == blockIdx.x && tid == j) continue;
if (pid_j < nall) {
numtyp delx = ix - posSh[j ];
numtyp dely = iy - posSh[j+ BLOCK_1D];
numtyp delz = iz - posSh[j+2*BLOCK_1D];
jtype = typeSh[j];
mtype = itype + jtype*MAX_SHARED_TYPES;
numtyp r2inv = delx * delx + dely * dely + delz * delz;
if (r2inv < cutsqSh[mtype]) {
r2inv = (numtyp)1.0/r2inv;
numtyp r6inv = r2inv * r2inv * r2inv;
numtyp force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
f.x += delx * force;
f.y += dely * force;
f.z += delz * force;
if (eflag) {
numtyp e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
ener +=(e-offsetSh[mtype]);
}
if (vflag) {
v0.x += delx*delx*force;
v0.y += dely*dely*force;
v0.z += delz*delz*force;
v1.x += delx*dely*force;
v1.y += delx*delz*force;
v1.z += dely*delz*force;
}
}
}
}
}
__syncthreads();
}
if (gid < inum) {
if (eflag)
energy[gid] = ener;
if (vflag) {
virial[2*gid ] = v0;
virial[2*gid+1] = v1;
}
force3[gid] = f;
}
}
#endif
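The indexing changes above drop the cubic-grid assumption: the old kernel recovered cell coordinates from gridDim.x, which is only correct when ncellx = ncelly = ncellz, while the new code launches a dim3(ncellx, ncelly*ncellz) grid and unfolds blockIdx.y with ncelly. A small self-contained sketch of the mapping (cell_id_demo is an illustrative name):

    // One block per cell; blockIdx.y folds the y and z cell coordinates.
    __global__ void cell_id_demo(int *cid_out, int ncellx, int ncelly) {
      int bx = blockIdx.x;            // x cell coordinate
      int by = blockIdx.y % ncelly;   // y cell coordinate
      int bz = blockIdx.y / ncelly;   // z cell coordinate
      // row-major flattening, matching cid in kernel_lj_cell
      int cid = bx + by * ncellx + bz * ncellx * ncelly;
      if (threadIdx.x == 0) cid_out[cid] = cid;
    }
    // launched as: cell_id_demo<<<dim3(ncellx, ncelly*ncellz), blockSize>>>(d_cid, ncellx, ncelly);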

View File: lib/gpu/lj_gpu_memory.h

@@ -25,8 +25,8 @@
#include "pair_gpu_atom.h"
#include "pair_gpu_nbor.h"
#define BLOCK_1D 64
#define CELL_SIZE 64
#define BLOCK_1D 64 // max value = 256
#define CELL_SIZE BLOCK_1D
#define MAX_SHARED_TYPES 8
#define PERCENT_GPU_MEMORY 0.7
#define BIG_NUMBER 100000000
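Tying CELL_SIZE to BLOCK_1D gives each thread block exactly one cell slot per thread, and the "max value = 256" note matches the sizes the kernels are instantiated for. A hypothetical compile-time guard, not part of the original header, could make that constraint explicit:

    // Hypothetical guard, assuming only the 64/128/256 instantiations exist.
    #if (BLOCK_1D != 64) && (BLOCK_1D != 128) && (BLOCK_1D != 256)
    #error "BLOCK_1D must be 64, 128, or 256: the cell-list kernels are only instantiated for these sizes"
    #endif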

View File: lib/gpu/pair_gpu_cell.cu

@@ -17,6 +17,7 @@
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include <assert.h>
#include "lj_gpu_memory.h"
#include "pair_gpu_cell.h"
@@ -60,11 +61,14 @@ __global__ void kernel_build_cell_list(float3 *cell_list,
float3 *pos,
int *type,
const int inum,
const int nall)
const int nall,
const int cell_size)
{
unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
float cSize = d_cell_size[0];
int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
if (gid < nall) {
float3 p = pos[gid];
@@ -75,11 +79,11 @@ __global__ void kernel_build_cell_list(float3 *cell_list,
p.z = fmaxf(p.z, d_boxlo[2]-cSize);
p.z = fminf(p.z, d_boxhi[2]+cSize);
int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncell1D
+ (int)(p.z/cSize + 1.0) * ncell1D * ncell1D;
int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncellx
+ (int)(p.z/cSize + 1.0) * ncellx * ncelly;
int atom_pos = atomicAdd(&cell_atom[cell_id], 1);
int pid = cell_id*CELL_SIZE + atom_pos;
int pid = cell_id*cell_size + atom_pos;
cell_list[pid] = pos[gid];
cell_type[pid] = type[gid];
@@ -92,18 +96,20 @@ __global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebu
{
float cSize = d_cell_size[0];
int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
// calculate 3D block idx from 2D block
int bx = blockIdx.x;
int by = blockIdx.y % gridDim.x;
int bz = blockIdx.y / gridDim.x;
int by = blockIdx.y % ncelly;
int bz = blockIdx.y / ncelly;
int tid = threadIdx.x;
// compute cell idx from 3D block idx
int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list
int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list
float skin = d_skin[0];
float lowx = d_boxlo[0] + (bx-1)*cSize - 0.5*skin;
@@ -113,7 +119,7 @@ __global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebu
float lowz = d_boxlo[2] + (bz-1)*cSize - 0.5*skin;
float hiz = lowz + cSize + skin;
for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
int pid = pbase + i;
float3 p = cell_list[pid];
p.x = fmaxf(p.x, d_boxlo[0]-cSize);
@@ -136,25 +142,30 @@ __global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int nc
unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
if (gid < ncell) {
if (cell_atom[gid] > CELL_SIZE)
if (cell_atom[gid] > blockDim.x)
*overflow = 1;
}
}
__global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int *cell_atom, float3 *pos)
{
float cSize = d_cell_size[0];
int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
// calculate 3D block idx from 2D block
int bx = blockIdx.x;
int by = blockIdx.y % gridDim.x;
int bz = blockIdx.y / gridDim.x;
int by = blockIdx.y % ncelly;
int bz = blockIdx.y / ncelly;
int tid = threadIdx.x;
// compute cell idx from 3D block idx
int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list
int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list
for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
int pid = pbase + i;
cell_list[pid] = pos[cell_idx[pid]];
}
@@ -164,17 +175,7 @@ __global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int
__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit);
void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size)
{
int i = 0;
const unsigned int bitSize = sizeof(unsigned int)*8;
const unsigned int bitStep = 4;
const int gSize = size/BLOCK_1D;
while (bitSize > i*bitStep) {
radixSortBlocks<<<gSize, BLOCK_1D, 2*BLOCK_1D*sizeof(unsigned int)>>>(keys, values1, values2, bitStep, i*bitStep);
i++;
}
}
#ifdef __DEVICE_EMULATION__
#define __SYNC __syncthreads();
@@ -253,7 +254,7 @@ __device__ unsigned int rank(unsigned int preds)
unsigned int address = scan(preds);
__shared__ unsigned int numtrue;
if (threadIdx.x == BLOCK_1D - 1)
if (threadIdx.x == blockDim.x - 1)
{
numtrue = address + preds;
}
@@ -266,11 +267,12 @@ __device__ unsigned int rank(unsigned int preds)
return rank;
}
template<int blockSize>
__device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, unsigned int nbits, unsigned int startbit)
{
extern __shared__ unsigned int sMem1[];
__shared__ float sMem2[BLOCK_1D];
__shared__ int sMem3[BLOCK_1D];
__shared__ float sMem2[blockSize];
__shared__ int sMem3[blockSize];
int tid = threadIdx.x;
@@ -314,7 +316,11 @@ __device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, u
}
__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit)
__global__ void radixSortBlocks(unsigned int *keys,
float3 *values1,
int *values2,
unsigned int nbits,
unsigned int startbit)
{
extern __shared__ unsigned int sMem[];
@@ -328,13 +334,30 @@ __global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values
value2 = values2[gid];
__syncthreads();
radixSortBlock(&key, &value1, &value2, nbits, startbit);
if (blockDim.x == 64)
radixSortBlock<64>(&key, &value1, &value2, nbits, startbit);
else if (blockDim.x == 128)
radixSortBlock<128>(&key, &value1, &value2, nbits, startbit);
else if (blockDim.x == 256)
radixSortBlock<256>(&key, &value1, &value2, nbits, startbit);
keys[gid] = key;
values1[gid] = value1;
values2[gid] = value2;
}
void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size, int cell_size)
{
int i = 0;
const unsigned int bitSize = sizeof(unsigned int)*8;
const unsigned int bitStep = 4;
const int gSize = size/cell_size;
while (bitSize > i*bitStep) {
radixSortBlocks<<<gSize, cell_size, 2*cell_size*sizeof(unsigned int)>>>(keys, values1, values2, bitStep, i*bitStep);
i++;
}
}
static float3 *d_pos, *pos_temp;
static int *d_type;
static int *d_overflow, *d_rebuild;
@@ -376,9 +399,12 @@ void clear_cell_list(cell_list &cell_list_gpu)
void build_cell_list(double *atom_pos, int *atom_type,
cell_list &cell_list_gpu,
const int ncell, const int ncell1D, const int buffer,
const int inum, const int nall, const int ago)
const int ncell, const int ncellx, const int ncelly, const int ncellz,
const int buffer, const int inum, const int nall, const int ago)
{
cudaError_t err;
cudaMemset(d_overflow, 0, sizeof(int));
cudaMemset(d_rebuild, 0, sizeof(int));
@@ -394,28 +420,24 @@ void build_cell_list(double *atom_pos, int *atom_type,
// copy the last built cell-list and test whether it needs to be rebuilt
if (!first_build) {
dim3 block(BLOCK_1D);
dim3 grid(ncell1D, ncell1D*ncell1D);
kernel_copy_list<<<grid, block>>>(cell_list_gpu.pos,
dim3 grid(ncellx, ncelly*ncellz);
kernel_copy_list<<<grid, buffer>>>(cell_list_gpu.pos,
cell_list_gpu.idx,
cell_list_gpu.natom, d_pos);
cudaMemset(d_rebuild, 0, sizeof(int));
int *temp = (int*)malloc(sizeof(int)*ncell);
kernel_test_rebuild<<<grid, block>>>(cell_list_gpu.pos,
kernel_test_rebuild<<<grid, buffer>>>(cell_list_gpu.pos,
cell_list_gpu.natom,
d_rebuild);
cudaMemcpy(&rebuild, d_rebuild, sizeof(int), cudaMemcpyDeviceToHost);
err = cudaGetLastError();
assert(err == cudaSuccess);
}
/*if (!first_build) {
dim3 block(BLOCK_1D);
dim3 grid(ncell1D, ncell1D*ncell1D);
kernel_copy_list<<<grid, block>>>(cell_list_gpu.pos,
cell_list_gpu.idx,
cell_list_gpu.natom, d_pos);
}*/
if (ago == 0) rebuild = 1;
// build cell-list for the first time
if (first_build || rebuild) {
first_build = 0;
@@ -431,24 +453,32 @@
cell_list_gpu.idx,
cell_list_gpu.type,
cell_list_gpu.natom,
d_pos, d_type, inum, nall);
d_pos, d_type, inum, nall, buffer);
err = cudaGetLastError();
assert(err == cudaSuccess);
// check cell list overflow
int overflow;
int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/BLOCK_1D));
kernel_test_overflow<<<gDimCell, BLOCK_1D>>>(cell_list_gpu.natom,
d_overflow, ncell);
int overflow = 0;
int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/buffer));
kernel_test_overflow<<<gDimCell, buffer>>>(cell_list_gpu.natom,
d_overflow, ncell);
cudaMemcpy(&overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);
if (overflow > 0) {
printf("\n\nBLOCK_1D too small for cell list, please increase it!\n\n");
printf("\n BLOCK_1D too small for cell list, please increase it!");
printf("\n BLOCK_1D = %d",BLOCK_1D);
printf("\n ncell = %d",ncell);
printf("\n gDimCell = %d",gDimCell);
printf("\n overflow = %d \n",overflow);
exit(0);
}
// sort atoms in every cell by atom index to avoid floating-point associativity problems.
sortBlocks(cell_list_gpu.idx, cell_list_gpu.pos,
cell_list_gpu.type, ncell*buffer);
cell_list_gpu.type, ncell*buffer, buffer);
cudaThreadSynchronize();
err = cudaGetLastError();
assert(err == cudaSuccess);
}
}
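The per-cell radix sort exists for reproducibility: floating-point addition is not associative, so accumulating a cell's pair forces in a different order changes the low bits of the result from run to run. Sorting by atom index pins the summation order. A minimal host-side illustration of the underlying effect:

    // Demonstrates float non-associativity; the values are illustrative.
    #include <cstdio>
    int main() {
      float big = 1.0e8f, small = 1.0f;
      float a = (big + small) - big;  // small is absorbed by big: prints 0
      float b = (big - big) + small;  // prints 1
      printf("a = %g, b = %g\n", a, b);
      return 0;
    }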

View File: lib/gpu/pair_gpu_cell.h

@@ -20,6 +20,22 @@
#ifndef PAIR_GPU_CELL_H
#define PAIR_GPU_CELL_H
#ifdef WINDLL
#include <windows.h>
#endif
#ifdef WINDLL
#define EXTERN extern "C" __declspec(dllexport)
#else
#define EXTERN
#endif
using namespace std;
static float kernelTime = 0.0;
static int ncellx, ncelly, ncellz;
static float *energy, *d_energy;
static float3 *d_force, *f_temp, *v_temp, *d_virial;
typedef struct {
float3 *pos;
@@ -28,6 +44,8 @@ typedef struct {
int *natom;
} cell_list;
static cell_list cell_list_gpu;
__global__ void kernel_set_cell_list(unsigned int *cell_idx);
__global__ void kernel_build_cell_list(float3 *cell_list,
unsigned int *cell_idx,
@@ -54,8 +72,8 @@ void init_cell_list(cell_list &cell_list_gpu,
void build_cell_list(double *atom_pos, int *atom_type,
cell_list &cell_list_gpu,
const int ncell, const int ncell1D, const int buffer,
const int inum, const int nall, const int ago);
const int ncell, const int ncellx, const int ncelly, const int ncellz,
const int buffer, const int inum, const int nall, const int ago);
void clear_cell_list(cell_list &cell_list_gpu);

View File: lib/gpu/pair_gpu_nbor.h

@@ -24,7 +24,6 @@
#include "nvc_timer.h"
#include "nvc_memory.h"
#define BLOCK_1D 64
#define IJ_SIZE 131072
class PairGPUNbor {