diff --git a/lib/gpu/lj_gpu.cu b/lib/gpu/lj_gpu.cu
index e8791069d5..e31321ad2e 100644
--- a/lib/gpu/lj_gpu.cu
+++ b/lib/gpu/lj_gpu.cu
@@ -27,29 +27,12 @@
 #include "lj_gpu_memory.cu"
 #include "lj_gpu_kernel.h"
 
-#ifdef WINDLL
-#include <windows.h>
-BOOL APIENTRY DllMain(HANDLE hModule, DWORD dwReason, LPVOID lpReserved)
-{
-  return TRUE;
-}
-#endif
-
-#ifdef WINDLL
-#define EXTERN extern "C" __declspec(dllexport)
-#else
-#define EXTERN
-#endif
-
-using namespace std;
 
 static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
 #define LJMT LJ_GPU_Memory<numtyp,acctyp>
-static float kernelTime = 0.0;
-static int ncell1D;
-static float *energy, *d_energy;
-static float3 *d_force, *f_temp, *v_temp, *d_virial;
-static cell_list cell_list_gpu;
+
 
 // ---------------------------------------------------------------------------
 // Convert something to a string
@@ -93,8 +76,10 @@ EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **
   bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
                        host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id);
 
-  ncell1D = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
-
+  ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
+  ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
+  ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size);
+
   init_cell_list_const(cell_size, skin, boxlo, boxhi);
 
   return ret;
@@ -153,6 +138,8 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
                     const int nall, const int ago, const bool eflag, const bool vflag,
                     const double *boxlo, const double *boxhi)
 {
+  cudaError_t err;
+
   ljm.atom.nall(nall);
   ljm.atom.inum(inum);
 
@@ -161,8 +148,8 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
 
   double evdwl=0.0;
 
-  static int buffer = CELL_SIZE;
-  static int ncell = (int)pow((float)ncell1D,3);
+  static int blockSize = BLOCK_1D;
+  static int ncell = ncellx*ncelly*ncellz;
 
   static int first_call = 1;
 
@@ -176,7 +163,7 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
     cudaMalloc((void**)&d_energy, inum*sizeof(float));
     cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
 
-    init_cell_list(cell_list_gpu, nall, ncell, buffer);
+    init_cell_list(cell_list_gpu, nall, ncell, blockSize);
 
     first_call = 0;
   }
@@ -198,13 +185,13 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
     cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
 
     clear_cell_list(cell_list_gpu);
-    init_cell_list(cell_list_gpu, nall, ncell, buffer);
+    init_cell_list(cell_list_gpu, nall, ncell, blockSize);
   }
 
   // build cell-list on GPU
   ljm.atom.time_atom.start();
   build_cell_list(host_x[0], host_type, cell_list_gpu,
-                  ncell, ncell1D, buffer, inum, nall, ago);
+                  ncell, ncellx, ncelly, ncellz, blockSize, inum, nall, ago);
   ljm.atom.time_atom.stop();
 
   ljm.time_pair.start();
@@ -216,25 +203,32 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
   cudaEventRecord(start, 0);
 #endif
 
+#define KERNEL_LJ_CELL(e, v, b, s) kernel_lj_cell<e, v, b><<<GX, b, s>>> \
+    (d_force, d_energy, d_virial, \
+     cell_list_gpu.pos, \
+     cell_list_gpu.idx, \
+     cell_list_gpu.type, \
+     cell_list_gpu.natom, \
+     inum, nall, ncell, ncellx, ncelly, ncellz);
+
   // call the cell-list force kernel
-  const int BX=BLOCK_1D;
-  dim3 GX(ncell1D, ncell1D*ncell1D);
+  const int BX=blockSize;
+  dim3 GX(ncellx, ncelly*ncellz);
+
   if (eflag == 0 && vflag == 0) {
-    kernel_lj_cell<false,false><<<GX, BX>>>
-      (d_force, d_energy, d_virial,
-       cell_list_gpu.pos,
-       cell_list_gpu.idx,
-       cell_list_gpu.type,
-       cell_list_gpu.natom,
-       inum, nall, ncell);
+    if (blockSize == 64 ) KERNEL_LJ_CELL(false, false, 64, 0);
+    if (blockSize == 128) KERNEL_LJ_CELL(false, false, 128, 0);
+    if (blockSize == 256) KERNEL_LJ_CELL(false, false, 256, 0);
   } else {
-    kernel_lj_cell<true,true><<<GX, BX, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES>>>
-      (d_force, d_energy, d_virial,
-       cell_list_gpu.pos,
-       cell_list_gpu.idx,
-       cell_list_gpu.type,
-       cell_list_gpu.natom,
-       inum, nall, ncell);
+    if (blockSize == 64)  KERNEL_LJ_CELL(true, true, 64, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
+    if (blockSize == 128) KERNEL_LJ_CELL(true, true, 128, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
+    if (blockSize == 256) KERNEL_LJ_CELL(true, true, 256, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
+  }
+
+  err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    printf("LJ force kernel launch error: %d\n", err);
+    exit(1);
   }
 
 #ifdef TIMING
@@ -296,122 +290,6 @@ EXTERN double lj_gpu_cell(double **force, double *virial, double **host_x, int *
                            ago, eflag, vflag, boxlo, boxhi);
 }
 
-template <class numtyp, class acctyp>
-double _lj_gpu_n2(LJMT &ljm, double **force, double *virial,
-                  double **host_x, int *host_type, const int inum, const int nall, const bool eflag, const bool vflag,
-                  const double *boxlo, const double *boxhi)
-{
-  ljm.atom.nall(nall);
-  ljm.atom.inum(inum);
-
-
-  ljm.nbor.time_nbor.start();
-  ljm.nbor.time_nbor.stop();
-
-
-  double evdwl=0.0;
-
-#ifdef NOUSE
-  static int first_call = 1;
-
-  if (first_call) {
-    energy = (float*) malloc(inum*sizeof(float));
-    v_temp = (float3*) malloc(inum*2*sizeof(float3));
-    cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
-    cudaMallocHost((void**)&pos_temp, nall*sizeof(float3));
-    cudaMalloc((void**)&d_force, inum*sizeof(float3));
-    cudaMalloc((void**)&d_energy, inum*sizeof(float));
-    cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
-    cudaMalloc((void**)&d_pos, nall*sizeof(float3));
-    cudaMalloc((void**)&d_type, nall*sizeof(int));
-    first_call = 0;
-  }
-
-
-  ljm.atom.time_atom.start();
-  double *atom_pos = host_x[0];
-  for (int i = 0; i < 3*nall; i+=3) {
-    pos_temp[i/3] = make_float3(atom_pos[i], atom_pos[i+1], atom_pos[i+2]);
-  }
-  cudaMemcpy(d_pos, pos_temp, nall*sizeof(float3), cudaMemcpyHostToDevice);
-  cudaMemcpy(d_type, host_type, nall*sizeof(int), cudaMemcpyHostToDevice);
-
-  ljm.atom.time_atom.stop();
-
-  ljm.time_pair.start();
-
-  // Compute the block size and grid size to keep all cores busy
-  const int BX=BLOCK_1D;
-  dim3 GX(static_cast<int>(ceil(static_cast<double>(inum)/BX)));
-
-#ifdef TIMING
-  cudaEvent_t start, stop;
-  cudaEventCreate(&start);
-  cudaEventCreate(&stop);
-  cudaEventRecord(start, 0);
-#endif
-
-  // N^2 force kernel
-  kernel_lj_n2<numtyp><<<GX, BX>>>(d_force, d_energy, d_virial,
-                                   d_pos, d_type, eflag, vflag, inum, nall);
-
-#ifdef TIMING
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
-  float kTime;
-  cudaEventElapsedTime(&kTime, start, stop);
-  kernelTime += kTime;
-  printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag);
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-#endif
-
-  // copy results from GPU to CPU
-  cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost);
-  if (eflag) {
-    cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost);
-    for (int i = 0; i < inum; i++) {
-      evdwl += energy[i];
-    }
-    evdwl *= 0.5f;
-  }
-  if (vflag) {
-    cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost);
-    for (int i = 0; i < inum; i++) {
-      virial[0] += v_temp[2*i].x;
-      virial[1] += v_temp[2*i].y;
-      virial[2] += v_temp[2*i].z;
-      virial[3] += v_temp[2*i+1].x;
-      virial[4] += v_temp[2*i+1].y;
-      virial[5] += v_temp[2*i+1].z;
-    }
-    for (int i = 0; i < 6; i++)
-      virial[i] *= 0.5f;
-  }
-
-  for (int i = 0; i < inum; i++) {
-    force[i][0] += f_temp[i].x;
-    force[i][1] += f_temp[i].y;
-    force[i][2] += f_temp[i].z;
-  }
-#endif
-  ljm.time_pair.stop();
-
-  ljm.atom.time_atom.add_to_total();
-  ljm.nbor.time_nbor.add_to_total();
-  ljm.time_pair.add_to_total();
-
-  return evdwl;
-}
-
-EXTERN double lj_gpu_n2(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
-                        const bool eflag, const bool vflag,
-                        const double *boxlo, const double *boxhi)
-{
-  return _lj_gpu_n2(LJMF, force, virial, host_x, host_type, inum, nall,
-                    eflag, vflag, boxlo, boxhi);
-}
-
 EXTERN void lj_gpu_time() {
   cout.precision(4);
   cout << "Atom copy:  " << LJMF.atom.time_atom.total_seconds() << " s.\n";
diff --git a/lib/gpu/lj_gpu_kernel.h b/lib/gpu/lj_gpu_kernel.h
index 7af33ce6de..f783616a83 100644
--- a/lib/gpu/lj_gpu_kernel.h
+++ b/lib/gpu/lj_gpu_kernel.h
@@ -21,25 +21,29 @@
 #define LJ_GPU_KERNEL
 
 /* Cell list version of LJ kernel */
-template<bool eflag, bool vflag>
+template<bool eflag, bool vflag, int blockSize>
 __global__ void kernel_lj_cell(float3 *force3,
                                float *energy, float3 *virial,
                                float3 *cell_list, unsigned int *cell_idx,
                                int *cell_type, int *cell_atom,
-                               const int inum, const int nall, const int ncell)
+                               const int inum, const int nall, const int ncell,
+                               const int ncellx, const int ncelly, const int ncellz)
 {
+
+
   // calculate 3D block idx from 2d block
   int bx = blockIdx.x;
-  int by = blockIdx.y % gridDim.x;
-  int bz = blockIdx.y / gridDim.x;
+  int by = blockIdx.y % ncelly;
+  int bz = blockIdx.y / ncelly;
 
   int tid = threadIdx.x;
-
+
   // compute cell idx from 3D block idx
-  int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
-
-  __shared__ int typeSh[CELL_SIZE];
-  __shared__ float posSh[CELL_SIZE*3];
+  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
+
+  __shared__ int typeSh[blockSize];
+  __shared__ float posSh[blockSize*3];
   __shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
@@ -51,7 +55,7 @@ __global__ void kernel_lj_cell(float3 *force3,
   __shared__ float *offsetSh;
 
   // load force parameters into shared memory
-  for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
+  for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
     int itype = i/MAX_SHARED_TYPES;
     int jtype = i%MAX_SHARED_TYPES;
     cutsqSh[i] = _cutsq_<float>(itype,jtype);
@@ -65,7 +69,7 @@ __global__ void kernel_lj_cell(float3 *force3,
     lj3Sh = smem;
     lj4Sh = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
     offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
-    for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += BLOCK_1D) {
+    for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
       int itype = i/MAX_SHARED_TYPES;
       int jtype = i%MAX_SHARED_TYPES;
       lj3Sh[i] = _lj3_<float>(itype,jtype).x+0.01;
@@ -76,41 +80,41 @@ __global__ void kernel_lj_cell(float3 *force3,
 
   __syncthreads();
 
-  int nborz0 = max(bz-1,0), nborz1 = min(bz+1, gridDim.x-1),
-      nbory0 = max(by-1,0), nbory1 = min(by+1, gridDim.x-1),
-      nborx0 = max(bx-1,0), nborx1 = min(bx+1, gridDim.x-1);
+  int nborz0 = max(bz-1,0), nborz1 = min(bz+1, ncellz-1),
+      nbory0 = max(by-1,0), nbory1 = min(by+1, ncelly-1),
+      nborx0 = max(bx-1,0), nborx1 = min(bx+1, ncellx-1);
 
-  for (int ii = 0; ii < ceil((float)(cell_atom[cid])/BLOCK_1D); ii++) {
+  for (int ii = 0; ii < ceil((float)(cell_atom[cid])/blockSize); ii++) {
     float3 f = {0.0f, 0.0f, 0.0f};
     float ener = 0.0f;
     float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
     int itype;
     float ix, iy, iz;
-    int i = tid + ii*BLOCK_1D;
-    unsigned int answer_pos = cell_idx[cid*CELL_SIZE+i];
+    int i = tid + ii*blockSize;
+    unsigned int answer_pos = cell_idx[cid*blockSize+i];
 
     // load current cell atom position and type into sMem
-    for (int j = tid; j < cell_atom[cid]; j += BLOCK_1D) {
-      int pid = cid*CELL_SIZE + j;
+    for (int j = tid; j < cell_atom[cid]; j += blockSize) {
+      int pid = cid*blockSize + j;
       float3 pos = cell_list[pid];
       posSh[j            ] = pos.x;
-      posSh[j+  CELL_SIZE] = pos.y;
-      posSh[j+2*CELL_SIZE] = pos.z;
+      posSh[j+  blockSize] = pos.y;
+      posSh[j+2*blockSize] = pos.z;
       typeSh[j] = cell_type[pid];
     }
     __syncthreads();
 
     if (answer_pos < inum) {
       itype = typeSh[i];
       ix = posSh[i            ];
-      iy = posSh[i+  CELL_SIZE];
-      iz = posSh[i+2*CELL_SIZE];
+      iy = posSh[i+  blockSize];
+      iz = posSh[i+2*blockSize];
 
       // compute force from current cell
       for (int j = 0; j < cell_atom[cid]; j++) {
         if (j == i) continue;
         float delx = ix - posSh[j            ];
-        float dely = iy - posSh[j+  CELL_SIZE];
-        float delz = iz - posSh[j+2*CELL_SIZE];
+        float dely = iy - posSh[j+  blockSize];
+        float delz = iz - posSh[j+2*blockSize];
         int jtype = typeSh[j];
         int mtype = itype + jtype*MAX_SHARED_TYPES;
         float r2inv = delx*delx + dely*dely + delz*delz;
@@ -149,16 +153,16 @@ __global__ void kernel_lj_cell(float3 *force3,
         if (nborz == bz && nbory == by && nborx == bx) continue;
 
         // compute cell id
-        int cid_nbor = nborx + INT_MUL(nbory,gridDim.x) +
-                       INT_MUL(nborz,gridDim.x*gridDim.x);
+        int cid_nbor = nborx + INT_MUL(nbory,ncellx) +
+                       INT_MUL(nborz,INT_MUL(ncellx,ncelly));
 
         // load neighbor cell position and type into smem
-        for (int j = tid; j < cell_atom[cid_nbor]; j += BLOCK_1D) {
-          int pid = INT_MUL(cid_nbor,CELL_SIZE) + j;
+        for (int j = tid; j < cell_atom[cid_nbor]; j += blockSize) {
+          int pid = INT_MUL(cid_nbor,blockSize) + j;
           float3 pos = cell_list[pid];
           posSh[j            ] = pos.x;
-          posSh[j+  CELL_SIZE] = pos.y;
-          posSh[j+2*CELL_SIZE] = pos.z;
+          posSh[j+  blockSize] = pos.y;
+          posSh[j+2*blockSize] = pos.z;
           typeSh[j] = cell_type[pid];
         }
         __syncthreads();
@@ -166,8 +170,8 @@ __global__ void kernel_lj_cell(float3 *force3,
         if (answer_pos < inum) {
           for (int j = 0; j < cell_atom[cid_nbor]; j++) {
             float delx = ix - posSh[j            ];
-            float dely = iy - posSh[j+  CELL_SIZE];
-            float delz = iz - posSh[j+2*CELL_SIZE];
+            float dely = iy - posSh[j+  blockSize];
+            float delz = iz - posSh[j+2*blockSize];
             int jtype = typeSh[j];
             int mtype = itype + jtype*MAX_SHARED_TYPES;
             float r2inv = delx*delx + dely*dely + delz*delz;
@@ -438,116 +442,4 @@ __global__ void kernel_lj_fast(const numtyp *special_lj, const int *dev_nbor,
   } // if ii
 
 }
-
-/* Brute force O(N^2) version of LJ kernel */
-template <class numtyp>
-__global__ void kernel_lj_n2(float3 *force3,
-                             float *energy, float3 *virial,
-                             float3 *pos, int *type,
-                             const bool eflag, const bool vflag, const int inum, const int nall)
-{
-  int gid = threadIdx.x + INT_MUL(blockIdx.x, blockDim.x);
-  int tid = threadIdx.x;
-
-  __shared__ float posSh[BLOCK_1D*3];
-  __shared__ int typeSh[BLOCK_1D];
-  __shared__ numtyp cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj3Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp lj4Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-  __shared__ numtyp offsetSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
-
-  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-    int itype=tid/MAX_SHARED_TYPES;
-    int jtype=tid%MAX_SHARED_TYPES;
-    cutsqSh[tid]=_cutsq_<numtyp>(itype,jtype);
-    lj1Sh[tid]=_lj1_<numtyp>(itype,jtype).x;
-    lj2Sh[tid]=_lj1_<numtyp>(itype,jtype).y;
-    lj3Sh[tid]=_lj3_<numtyp>(itype,jtype).x;
-    lj4Sh[tid]=_lj3_<numtyp>(itype,jtype).y;
-    offsetSh[tid]=_offset_<numtyp>(itype,jtype);
-  }
-  __syncthreads();
-
-  float3 f = {0.0f, 0.0f, 0.0f};
-  float ener = 0.0f;
-  float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
-
-  int itype, jtype;
-  int mtype;
-
-  numtyp ix, iy, iz;
-
-  if (gid < inum) {
-    ix = pos[gid].x;
-    iy = pos[gid].y;
-    iz = pos[gid].z;
-    itype = type[gid];
-  }
-
-  int pid = tid;
-  int nIter = ceil((float)nall/BLOCK_1D);
-  for (int jj = 0; jj < nIter; jj++, pid += BLOCK_1D) {
-
-    if (pid < nall) {
-      posSh[tid           ] = pos[pid].x;
-      posSh[tid+  BLOCK_1D] = pos[pid].y;
-      posSh[tid+2*BLOCK_1D] = pos[pid].z;
-      typeSh[tid] = type[pid];
-    }
-    __syncthreads();
-
-    if (gid < inum) {
-      int pid_j = jj*BLOCK_1D;
-
-      for (int j = 0; j < BLOCK_1D; j++, pid_j++) {
-        if (jj == blockIdx.x && tid == j) continue;
-        if (pid_j < nall) {
-          numtyp delx = ix - posSh[j           ];
-          numtyp dely = iy - posSh[j+  BLOCK_1D];
-          numtyp delz = iz - posSh[j+2*BLOCK_1D];
-          jtype = typeSh[j];
-          mtype = itype + jtype*MAX_SHARED_TYPES;
-          numtyp r2inv = delx * delx + dely * dely + delz * delz;
-
-          if (r2inv < cutsqSh[mtype]) {
-            r2inv = (numtyp)1.0/r2inv;
-            numtyp r6inv = r2inv * r2inv * r2inv;
-            numtyp force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
-            f.x += delx * force;
-            f.y += dely * force;
-            f.z += delz * force;
-
-            if (eflag) {
-              numtyp e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
-              ener +=(e-offsetSh[mtype]);
-            }
-            if (vflag) {
-              v0.x += delx*delx*force;
-              v0.y += dely*dely*force;
-              v0.z += delz*delz*force;
-              v1.x += delx*dely*force;
-              v1.y += delx*delz*force;
-              v1.z += dely*delz*force;
-            }
-          }
-        }
-      }
-    }
-
-    __syncthreads();
-  }
-
-  if (gid < inum) {
-    if (eflag)
-      energy[gid] = ener;
-    if (vflag) {
-      virial[2*gid  ] = v0;
-      virial[2*gid+1] = v1;
-    }
-    force3[gid] = f;
-  }
-
-}
-
 #endif
diff --git a/lib/gpu/lj_gpu_memory.h b/lib/gpu/lj_gpu_memory.h
index 35057bcfad..f5a407ba0b 100644
--- a/lib/gpu/lj_gpu_memory.h
+++ b/lib/gpu/lj_gpu_memory.h
@@ -25,8 +25,8 @@
 #include "pair_gpu_atom.h"
 #include "pair_gpu_nbor.h"
 
-#define BLOCK_1D 64
-#define CELL_SIZE 64
+#define BLOCK_1D 64  // max value = 256
+#define CELL_SIZE BLOCK_1D
 #define MAX_SHARED_TYPES 8
 #define PERCENT_GPU_MEMORY 0.7
 #define BIG_NUMBER 100000000
diff --git a/lib/gpu/pair_gpu_cell.cu b/lib/gpu/pair_gpu_cell.cu
index 11c78e87d6..2f167df15f 100644
--- a/lib/gpu/pair_gpu_cell.cu
+++ b/lib/gpu/pair_gpu_cell.cu
@@ -17,6 +17,7 @@
    Paul Crozier (SNL), pscrozi@sandia.gov
 ------------------------------------------------------------------------- */
 
+#include <assert.h>
 #include "lj_gpu_memory.h"
 #include "pair_gpu_cell.h"
 
@@ -60,11 +61,14 @@ __global__ void kernel_build_cell_list(float3 *cell_list,
                                        float3 *pos,
                                        int *type,
                                        const int inum,
-                                       const int nall)
+                                       const int nall,
+                                       const int cell_size)
 {
   unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
   float cSize = d_cell_size[0];
-  int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
+  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
 
   if (gid < nall) {
     float3 p = pos[gid];
@@ -75,11 +79,11 @@ __global__ void kernel_build_cell_list(float3 *cell_list,
     p.z = fmaxf(p.z, d_boxlo[2]-cSize);
     p.z = fminf(p.z, d_boxhi[2]+cSize);
 
-    int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncell1D
-                + (int)(p.z/cSize + 1.0) * ncell1D * ncell1D;
+    int cell_id = (int)(p.x/cSize + 1.0) + (int)(p.y/cSize + 1.0) * ncellx
+                + (int)(p.z/cSize + 1.0) * ncellx * ncelly;
 
     int atom_pos = atomicAdd(&cell_atom[cell_id], 1);
-    int pid = cell_id*CELL_SIZE + atom_pos;
+    int pid = cell_id*cell_size + atom_pos;
 
     cell_list[pid] = pos[gid];
     cell_type[pid] = type[gid];
@@ -92,18 +96,20 @@ __global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebu
 {
   float cSize = d_cell_size[0];
-  int ncell1D = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
+  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
 
   // calculate 3D block idx from 2d block
   int bx = blockIdx.x;
-  int by = blockIdx.y % gridDim.x;
-  int bz = blockIdx.y / gridDim.x;
+  int by = blockIdx.y % ncelly;
+  int bz = blockIdx.y / ncelly;
 
   int tid = threadIdx.x;
 
   // compute cell idx from 3D block idx
-  int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
-  int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list
+  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
+  int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list
 
   float skin = d_skin[0];
   float lowx = d_boxlo[0] + (bx-1)*cSize - 0.5*skin;
@@ -113,7 +119,7 @@ __global__ void kernel_test_rebuild(float3 *cell_list, int *cell_atom, int *rebu
   float lowz = d_boxlo[2] + (bz-1)*cSize - 0.5*skin;
   float hiz = lowz + cSize + skin;
 
-  for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
+  for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
     int pid = pbase + i;
     float3 p = cell_list[pid];
     p.x = fmaxf(p.x, d_boxlo[0]-cSize);
@@ -136,25 +142,30 @@ __global__ void kernel_test_overflow(int *cell_atom, int *overflow, const int nc
   unsigned int gid = threadIdx.x + blockIdx.x*blockDim.x;
 
   if (gid < ncell) {
-    if (cell_atom[gid] > CELL_SIZE)
+    if (cell_atom[gid] > blockDim.x)
       *overflow = 1;
   }
 }
 
 __global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int *cell_atom, float3 *pos)
 {
+  float cSize = d_cell_size[0];
+  int ncellx = ceil(((d_boxhi[0] - d_boxlo[0]) + 2.0f*cSize) / cSize);
+  int ncelly = ceil(((d_boxhi[1] - d_boxlo[1]) + 2.0f*cSize) / cSize);
+  int ncellz = ceil(((d_boxhi[2] - d_boxlo[2]) + 2.0f*cSize) / cSize);
+
   // calculate 3D block idx from 2d block
   int bx = blockIdx.x;
-  int by = blockIdx.y % gridDim.x;
-  int bz = blockIdx.y / gridDim.x;
+  int by = blockIdx.y % ncelly;
+  int bz = blockIdx.y / ncelly;
 
   int tid = threadIdx.x;
 
   // compute cell idx from 3D block idx
-  int cid = bx + INT_MUL(by, gridDim.x) + INT_MUL(bz, gridDim.x*gridDim.x);
-  int pbase = INT_MUL(cid,CELL_SIZE); // atom position id in cell list
+  int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
+  int pbase = INT_MUL(cid,blockDim.x); // atom position id in cell list
 
-  for (int i = tid; i < cell_atom[cid]; i += BLOCK_1D) {
+  for (int i = tid; i < cell_atom[cid]; i += blockDim.x) {
     int pid = pbase + i;
     cell_list[pid] = pos[cell_idx[pid]];
   }
@@ -164,17 +175,7 @@ __global__ void kernel_copy_list(float3 *cell_list, unsigned int *cell_idx, int
 
 __global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2,
                                 unsigned int nbits, unsigned int startbit);
 
-void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size)
-{
-  int i = 0;
-  const unsigned int bitSize = sizeof(unsigned int)*8;
-  const unsigned int bitStep = 4;
-  const int gSize = size/BLOCK_1D;
-  while (bitSize > i*bitStep) {
-    radixSortBlocks<<<gSize, BLOCK_1D, 4*sizeof(unsigned int)*BLOCK_1D>>>(keys, values1, values2, bitStep, i*bitStep);
-    i++;
-  }
-}
+
 
 #ifdef __DEVICE_EMULATION__
 #define __SYNC __syncthreads();
@@ -253,7 +254,7 @@ __device__ unsigned int rank(unsigned int preds)
   unsigned int address = scan(preds);
 
   __shared__ unsigned int numtrue;
-  if (threadIdx.x == BLOCK_1D - 1)
+  if (threadIdx.x == blockDim.x - 1)
   {
     numtrue = address + preds;
   }
@@ -266,11 +267,12 @@ __device__ unsigned int rank(unsigned int preds)
   return rank;
 }
 
+template <int blockSize>
 __device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, unsigned int nbits, unsigned int startbit)
 {
   extern __shared__ unsigned int sMem1[];
-  __shared__ float sMem2[BLOCK_1D];
-  __shared__ int sMem3[BLOCK_1D];
+  __shared__ float sMem2[blockSize];
+  __shared__ int sMem3[blockSize];
 
   int tid = threadIdx.x;
 
@@ -314,7 +316,11 @@ __device__ void radixSortBlock(unsigned int *key, float3 *value1, int *value2, u
 
 }
 
-__global__ void radixSortBlocks(unsigned int *keys, float3 *values1, int *values2, unsigned int nbits, unsigned int startbit)
+__global__ void radixSortBlocks(unsigned int *keys,
+                                float3 *values1,
+                                int *values2,
+                                unsigned int nbits,
+                                unsigned int startbit)
 {
   extern __shared__ unsigned int sMem[];
 
@@ -328,13 +334,30 @@ __global__ void radixSortBlocks(unsigned int *keys, float3 *values
   value2 = values2[gid];
   __syncthreads();
 
-  radixSortBlock(&key, &value1, &value2, nbits, startbit);
+  if (blockDim.x == 64)
+    radixSortBlock<64>(&key, &value1, &value2, nbits, startbit);
+  else if (blockDim.x == 128)
+    radixSortBlock<128>(&key, &value1, &value2, nbits, startbit);
+  else if (blockDim.x == 256)
+    radixSortBlock<256>(&key, &value1, &value2, nbits, startbit);
 
   keys[gid] = key;
   values1[gid] = value1;
   values2[gid] = value2;
 }
 
+void sortBlocks(unsigned int *keys, float3 *values1, int *values2, const int size, int cell_size)
+{
+  int i = 0;
+  const unsigned int bitSize = sizeof(unsigned int)*8;
+  const unsigned int bitStep = 4;
+  const int gSize = size/cell_size;
+  while (bitSize > i*bitStep) {
+    radixSortBlocks<<<gSize, cell_size, 4*sizeof(unsigned int)*cell_size>>>(keys, values1, values2, bitStep, i*bitStep);
+    i++;
+  }
+}
+
 static float3 *d_pos, *pos_temp;
 static int *d_type;
 static int *d_overflow, *d_rebuild;
@@ -376,9 +399,12 @@ void clear_cell_list(cell_list &cell_list_gpu)
 
 void build_cell_list(double *atom_pos, int *atom_type,
                      cell_list &cell_list_gpu,
-                     const int ncell, const int ncell1D, const int buffer,
-                     const int inum, const int nall, const int ago)
+                     const int ncell, const int ncellx, const int ncelly, const int ncellz,
+                     const int buffer, const int inum, const int nall, const int ago)
 {
+
+  cudaError_t err;
+
   cudaMemset(d_overflow, 0, sizeof(int));
   cudaMemset(d_rebuild, 0, sizeof(int));
 
@@ -394,28 +420,24 @@ void build_cell_list(double *atom_pos, int *atom_type,
 
   // copy the last built cell-list and test whether it needs to be rebuilt
   if (!first_build) {
-    dim3 block(BLOCK_1D);
-    dim3 grid(ncell1D, ncell1D*ncell1D);
-    kernel_copy_list<<<grid, block>>>(cell_list_gpu.pos,
+
+    dim3 grid(ncellx, ncelly*ncellz);
+    kernel_copy_list<<<grid, buffer>>>(cell_list_gpu.pos,
                                       cell_list_gpu.idx,
                                       cell_list_gpu.natom, d_pos);
     cudaMemset(d_rebuild, 0, sizeof(int));
     int *temp = (int*)malloc(sizeof(int)*ncell);
-    kernel_test_rebuild<<<grid, block>>>(cell_list_gpu.pos,
+    kernel_test_rebuild<<<grid, buffer>>>(cell_list_gpu.pos,
                                          cell_list_gpu.natom, d_rebuild);
     cudaMemcpy(&rebuild, d_rebuild, sizeof(int), cudaMemcpyDeviceToHost);
+
+    err = cudaGetLastError();
+    assert(err == cudaSuccess);
   }
 
-  /*if (!first_build) {
-    dim3 block(BLOCK_1D);
-    dim3 grid(ncell1D, ncell1D*ncell1D);
-    kernel_copy_list<<<grid, block>>>(cell_list_gpu.pos,
-                                      cell_list_gpu.idx,
-                                      cell_list_gpu.natom, d_pos);
-  }*/
 
   if (ago == 0)
     rebuild = 1;
-
+
   // build cell-list for the first time
   if (first_build || rebuild) {
     first_build = 0;
@@ -431,24 +453,32 @@ void build_cell_list(double *atom_pos, int *atom_type,
                                              cell_list_gpu.idx,
                                              cell_list_gpu.type,
                                              cell_list_gpu.natom,
-                                             d_pos, d_type, inum, nall);
-
+                                             d_pos, d_type, inum, nall, buffer);
+    err = cudaGetLastError();
+    assert(err == cudaSuccess);
 
     // check cell list overflow
-    int overflow;
-    int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/BLOCK_1D));
-    kernel_test_overflow<<<gDimCell, BLOCK_1D>>>(cell_list_gpu.natom,
-                                                 d_overflow, ncell);
+    int overflow = 0;
+    int gDimCell = static_cast<int>(ceil(static_cast<double>(ncell)/buffer));
+    kernel_test_overflow<<<gDimCell, buffer>>>(cell_list_gpu.natom,
+                                               d_overflow, ncell);
     cudaMemcpy(&overflow, d_overflow, sizeof(int), cudaMemcpyDeviceToHost);
+
    if (overflow > 0) {
-      printf("\n\nBLOCK_1D too small for cell list, please increase it!\n\n");
+      printf("\n BLOCK_1D too small for cell list, please increase it!");
+      printf("\n BLOCK_1D = %d",BLOCK_1D);
+      printf("\n ncell = %d",ncell);
+      printf("\n gDimCell = %d",gDimCell);
+      printf("\n overflow = %d \n",overflow);
      exit(0);
    }
-
+
     // sort atoms in every cell by atom index to avoid floating point associativity problem.
     sortBlocks(cell_list_gpu.idx, cell_list_gpu.pos,
-               cell_list_gpu.type, ncell*buffer);
+               cell_list_gpu.type, ncell*buffer, buffer);
     cudaThreadSynchronize();
+    err = cudaGetLastError();
+    assert(err == cudaSuccess);
   }
 }
diff --git a/lib/gpu/pair_gpu_cell.h b/lib/gpu/pair_gpu_cell.h
index 072f5ffbd0..48dab9adb0 100644
--- a/lib/gpu/pair_gpu_cell.h
+++ b/lib/gpu/pair_gpu_cell.h
@@ -20,6 +20,22 @@
 #ifndef PAIR_GPU_CELL_H
 #define PAIR_GPU_CELL_H
 
+#ifdef WINDLL
+#include <windows.h>
+#endif
+
+#ifdef WINDLL
+#define EXTERN extern "C" __declspec(dllexport)
+#else
+#define EXTERN
+#endif
+using namespace std;
+
+static float kernelTime = 0.0;
+static int ncellx, ncelly, ncellz;
+static float *energy, *d_energy;
+static float3 *d_force, *f_temp, *v_temp, *d_virial;
+
 typedef struct {
   float3 *pos;
@@ -28,6 +44,8 @@ typedef struct {
   int *natom;
 } cell_list;
 
+static cell_list cell_list_gpu;
+
 __global__ void kernel_set_cell_list(unsigned int *cell_idx);
 
 __global__ void kernel_build_cell_list(float3 *cell_list,
                                        unsigned int *cell_idx,
@@ -54,8 +72,8 @@ void init_cell_list(cell_list &cell_list_gpu,
 
 void build_cell_list(double *atom_pos, int *atom_type,
                      cell_list &cell_list_gpu,
-                     const int ncell, const int ncell1D, const int buffer,
-                     const int inum, const int nall, const int ago);
+                     const int ncell, const int ncellx, const int ncelly, const int ncellz,
+                     const int buffer, const int inum, const int nall, const int ago);
 
 void clear_cell_list(cell_list &cell_list_gpu);
 
diff --git a/lib/gpu/pair_gpu_nbor.h b/lib/gpu/pair_gpu_nbor.h
index f7eb376ea6..6d19972530 100644
--- a/lib/gpu/pair_gpu_nbor.h
+++ b/lib/gpu/pair_gpu_nbor.h
@@ -24,7 +24,6 @@
 #include "nvc_timer.h"
 #include "nvc_memory.h"
 
-#define BLOCK_1D 64
 #define IJ_SIZE 131072
 
 class PairGPUNbor {
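
A note on the block-size dispatch pattern this patch introduces: statically declared
__shared__ arrays (typeSh, posSh, sMem2, sMem3) need a compile-time constant size, so
the patch turns the block size into a template parameter (kernel_lj_cell<e,v,b>,
radixSortBlock<64|128|256>) and maps the runtime blockSize onto that fixed set of
instantiations with an if-chain. Below is a minimal, self-contained CUDA sketch of the
same technique; the names (scale_kernel, launch_scale) are hypothetical illustrations,
not code from the patch.

#include <cstdio>

// The template parameter is a compile-time constant, so the shared
// array can be statically sized per instantiation.
template <int blockSize>
__global__ void scale_kernel(float *data, int n, float factor)  // hypothetical example
{
  __shared__ float buf[blockSize];               // sized by the template parameter
  int gid = blockIdx.x * blockSize + threadIdx.x;

  // Stage through shared memory; guard the load but keep the barrier
  // outside the guard so every thread in the block reaches it.
  buf[threadIdx.x] = (gid < n) ? data[gid] : 0.0f;
  __syncthreads();
  if (gid < n)
    data[gid] = buf[threadIdx.x] * factor;
}

// Map a runtime block size onto the supported instantiations, mirroring
// the KERNEL_LJ_CELL and radixSortBlock<64/128/256> dispatch in the patch.
void launch_scale(float *d_data, int n, float factor, int blockSize)
{
  int grid = (n + blockSize - 1) / blockSize;
  if      (blockSize == 64)  scale_kernel<64> <<<grid, 64>>>(d_data, n, factor);
  else if (blockSize == 128) scale_kernel<128><<<grid, 128>>>(d_data, n, factor);
  else if (blockSize == 256) scale_kernel<256><<<grid, 256>>>(d_data, n, factor);
  else    fprintf(stderr, "unsupported block size: %d\n", blockSize);
}

Each branch compiles to a separate kernel with its own fixed shared-memory footprint,
so supporting a new size means adding one instantiation and one branch, exactly as the
patch does for 64, 128, and 256.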