git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2011-05-02 15:02:52 +00:00
parent 2be078632d
commit 5f799182b3
70 changed files with 4489 additions and 2253 deletions

View File

@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include
CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include
CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON
BIN_DIR = ./
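
The change repeated across all of these Makefiles is the addition of -DUCL_NO_EXIT: with that define, the Geryon layer reports failures through status codes instead of calling exit(), so the library's init routines can return an error for LAMMPS to handle. The header diffs further below document the resulting convention (0 on success, negative codes on failure). A minimal caller-side sketch, with a hypothetical stub standing in for the real init_atomic() member:

#include <cstdio>

// Hypothetical stub mirroring the convention documented in
// atomic_gpu_memory.h below: 0 on success, negative code on failure.
int init_atomic_stub(bool fail_oom) {
  if (fail_oom)
    return -3;   // -3: out-of-memory error
  return 0;      // 0: success
}

int main() {
  int err = init_atomic_stub(false);
  if (err != 0)
    fprintf(stderr, "GPU init failed with code %d\n", err);
  else
    printf("GPU pair style initialized\n");
  return 0;
}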

View File

@ -17,16 +17,16 @@
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA_HOME = /sw/analysis-x64/cuda/3.0/sl5.0_binary/
CUDA_HOME = /sw/analysis-x64/cuda/3.2/centos5.5_binary/
NVCC = nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_PRECISION = -D_SINGLE_DOUBLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON -openmp
CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -openmp
CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias
BIN_DIR = ./

View File

@ -24,7 +24,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON
CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT
CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops
BIN_DIR = ./

View File

@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
BIN_DIR = ./

View File

@ -17,7 +17,7 @@
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -lOpenCL
OCL_PREC = -D_SINGLE_SINGLE

View File

@ -23,7 +23,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB)
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
BIN_DIR = ./

View File

@ -24,7 +24,7 @@ CUDA_ARCH = -arch=sm_11
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32
CUDA_OPTS = -DUNIX -DUCL_NO_EXIT -O3 -Xptxas -v --use_fast_math -m32
CUDR_CPP = mpic++
CUDR_OPTS = -O2 -m32 -g

View File

@ -17,7 +17,7 @@
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON
OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON -DUCL_NO_EXIT
OCL_LINK = -framework OpenCL
OCL_PREC = -D_SINGLE_SINGLE

View File

@ -13,7 +13,8 @@
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Inderaj Bains (NVIDIA), ibains@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
@ -28,10 +29,11 @@ GPU_LIB = $(LIB_DIR)/libgpu.a
# Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h)
NVC_H = $(wildcard ./geryon/nvc*.h) $(UCL_H)
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H)
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) nv_kernel_def.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
pair_gpu_device.h pair_gpu_balance.h
PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
pair_gpu_balance.h pppm_gpu_memory.h
ALL_H = $(NVD_H) $(PAIR_H)
@ -39,28 +41,37 @@ EXECS = $(BIN_DIR)/nvc_get_devices
CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
$(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
$(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
$(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
$(OBJ_DIR)/charge_gpu_memory.o \
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
$(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
$(OBJ_DIR)/pair_gpu_device.o \
$(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
$(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
$(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
$(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
$(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
$(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
$(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
$(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
$(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
$(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
$(CUDPP)
PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
$(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
$(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
$(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
$(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h \
$(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h \
$(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
$(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
$(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
$(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h \
$(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
$(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
$(OBJ_DIR)/crml_cut_gpu_kernel.ptx $(OBJ_DIR)/crml_cut_gpu_ptx.h \
$(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h \
$(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
@ -93,6 +104,9 @@ $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
$(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu
@ -105,11 +119,20 @@ $(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
$(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_device.cpp
$(OBJ_DIR)/pair_gpu_dev_kernel.ptx: pair_gpu_dev_kernel.cu
$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_dev_kernel.cu
$(OBJ_DIR)/pair_gpu_dev_ptx.h: $(OBJ_DIR)/pair_gpu_dev_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_dev_kernel.ptx $(OBJ_DIR)/pair_gpu_dev_ptx.h
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_ptx.h
$(CUDR) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
$(CUDR) -o $@ -c atomic_gpu_memory.cpp
@ -117,6 +140,24 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.c
$(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
$(CUDR) -o $@ -c charge_gpu_memory.cpp
$(OBJ_DIR)/pppm_f_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ pppm_gpu_kernel.cu
$(OBJ_DIR)/pppm_f_gpu_ptx.h: $(OBJ_DIR)/pppm_f_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h
$(OBJ_DIR)/pppm_d_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ pppm_gpu_kernel.cu
$(OBJ_DIR)/pppm_d_gpu_ptx.h: $(OBJ_DIR)/pppm_d_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h
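
The pair of rules above builds the same pppm_gpu_kernel.cu twice, once with -Dgrdtyp=float and once with -Dgrdtyp=double, yielding single- and double-precision grid kernels from one source file. A stand-alone illustration of the same preprocessor trick (file name and macro default here are illustrative):

// Build twice:  c++ -Dgrdtyp=float  -o demo_f demo.cpp
//               c++ -Dgrdtyp=double -o demo_d demo.cpp
#ifndef grdtyp
#define grdtyp float   // fallback when no -D is supplied
#endif
#include <cstdio>

int main() {
  grdtyp g = (grdtyp)0.1;
  printf("grdtyp occupies %zu bytes\n", sizeof g);
  return 0;
}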
$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_f_gpu_ptx.h $(OBJ_DIR)/pppm_d_gpu_ptx.h
$(CUDR) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
$(CUDR) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu
@ -144,7 +185,7 @@ $(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_
$(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h
$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
@ -156,7 +197,7 @@ $(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_c
$(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
@ -168,9 +209,21 @@ $(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc
$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/morse_gpu_kernel.ptx: morse_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ morse_gpu_kernel.cu
$(OBJ_DIR)/morse_gpu_ptx.h: $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h
$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h
$(CUDR) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu
@ -180,7 +233,7 @@ $(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ke
$(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
$(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h
$(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
@ -192,9 +245,21 @@ $(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj9
$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h
$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_expand_gpu_kernel.ptx: lj_expand_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lj_expand_gpu_kernel.cu
$(OBJ_DIR)/lj_expand_gpu_ptx.h: $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h
$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h
$(CUDR) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu
@ -204,7 +269,7 @@ $(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_c
$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h
$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
@ -216,7 +281,7 @@ $(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/c
$(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
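
Throughout this Makefile, geryon/file_to_cstr.sh turns a generated .ptx kernel into a header that holds the PTX as one C string, so the host library can embed the kernel and hand it to the CUDA driver for JIT loading at run time. A rough C++ sketch of that conversion (the real script also handles escaping of embedded quotes and backslashes, omitted here for brevity; names are illustrative):

#include <fstream>
#include <iostream>
#include <string>

// Wrap each line of the input file in quotes, producing one long C string
// literal, roughly what file_to_cstr.sh emits into the *_ptx.h headers.
int main(int argc, char **argv) {
  if (argc != 3) { std::cerr << "usage: f2c in.ptx out.h\n"; return 1; }
  std::ifstream in(argv[1]);
  std::ofstream out(argv[2]);
  out << "const char *kernel_str =\n";
  std::string line;
  while (std::getline(in, line))
    out << "\"" << line << "\\n\"\n";
  out << ";\n";
  return 0;
}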

View File

@ -14,6 +14,7 @@
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Inderaj Bains (NVIDIA), ibains@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
@ -23,30 +24,37 @@ OCL_LIB = $(LIB_DIR)/libgpu.a
UCL_H = $(wildcard ./geryon/ucl*.h)
OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H)
# Headers for Pair Stuff
PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
pair_gpu_device.h pair_gpu_balance.h
PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \
pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \
pair_gpu_balance.h pppm_gpu_memory.h
ALL_H = $(OCL_H) $(PAIR_H)
EXECS = $(BIN_DIR)/ocl_get_devices
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
$(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
$(OBJ_DIR)/charge_gpu_memory.o \
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
$(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
$(OBJ_DIR)/pair_gpu_device.o \
$(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
$(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
$(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
$(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
$(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
$(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
$(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
$(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
$(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
$(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
KERS = $(OBJ_DIR)/pair_gpu_dev_cl.h $(OBJ_DIR)/pair_gpu_atom_cl.h \
$(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/pppm_gpu_cl.h \
$(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
$(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
$(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
$(OBJ_DIR)/crml_gpu_cl.h \
$(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
$(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h \
$(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/morse_gpu_cl.h \
$(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h \
$(OBJ_DIR)/cmmc_long_gpu_cl.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
all: $(OCL_LIB) $(EXECS)
@ -57,14 +65,23 @@ $(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
$(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(OCL_H)
$(OCL) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
$(OCL) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h
$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
$(OCL) -o $@ -c pair_gpu_device.cpp
$(OBJ_DIR)/pair_gpu_dev_cl.h: pair_gpu_dev_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh pair_gpu_dev_kernel.cu $(OBJ_DIR)/pair_gpu_dev_cl.h
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_cl.h
$(OCL) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
$(OCL) -o $@ -c atomic_gpu_memory.cpp
@ -72,6 +89,15 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.c
$(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
$(OCL) -o $@ -c charge_gpu_memory.cpp
$(OBJ_DIR)/pppm_gpu_cl.h: pppm_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh pppm_gpu_kernel.cu $(OBJ_DIR)/pppm_gpu_cl.h;
$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_gpu_cl.h $(OBJ_DIR)/pppm_gpu_cl.h
$(OCL) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
$(OCL) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h
@ -93,7 +119,7 @@ $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
$(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h
$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
@ -102,7 +128,7 @@ $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
$(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
@ -111,16 +137,25 @@ $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/morse_gpu_cl.h: morse_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh morse_gpu_kernel.cu $(OBJ_DIR)/morse_gpu_cl.h;
$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h
$(OCL) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h;
$(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
$(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp
$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h
$(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
@ -129,16 +164,25 @@ $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h
$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_expand_gpu_cl.h: lj_expand_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh lj_expand_gpu_kernel.cu $(OBJ_DIR)/lj_expand_gpu_cl.h;
$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h
$(OCL) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;
$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h
$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
@ -147,7 +191,7 @@ $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
$(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
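
Unlike the CUDA Makefile, the OpenCL build has no PTX step: file_to_cstr.sh embeds the .cu kernel sources directly as strings in the *_cl.h headers, because OpenCL compiles source for the selected device at run time. A minimal sketch of that load-and-build pattern (kernel body illustrative, error checking trimmed; on macOS the header is <OpenCL/opencl.h>):

#include <CL/cl.h>
#include <cstdio>

// An embedded kernel source string, standing in for a *_cl.h header.
static const char *src =
  "__kernel void scale(__global float *x, float a) {"
  "  int i = get_global_id(0); x[i] *= a; }";

int main() {
  cl_platform_id plat;
  cl_device_id dev;
  clGetPlatformIDs(1, &plat, NULL);
  clGetDeviceIDs(plat, CL_DEVICE_TYPE_DEFAULT, 1, &dev, NULL);
  cl_context ctx = clCreateContext(NULL, 1, &dev, NULL, NULL, NULL);
  cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, NULL);
  // JIT-compile the source for this device, as Geryon does internally
  cl_int err = clBuildProgram(prog, 1, &dev, NULL, NULL, NULL);
  printf("build status: %d\n", (int)err);
  clReleaseProgram(prog);
  clReleaseContext(ctx);
  return 0;
}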

View File

@ -14,6 +14,7 @@
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
Peng Wang (Nvidia), penwang@nvidia.com
Inderaj Bains (NVIDIA), ibains@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */

View File

@ -23,23 +23,28 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
AtomicGPUMemoryT::~AtomicGPUMemory() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
nbor_time_avail=false;
screen=_screen;
@ -48,24 +53,30 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(false,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
_gpu_host,max_nbors,cell_size,false))
return false;
_threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==false) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
nbor=&device->nbor;
_block_size=BLOCK_1D;
if (static_cast<size_t>(_block_size)>ucl_device->group_size())
_block_size=ucl_device->group_size();
_block_size=device->pair_block_size();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_split);
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
@ -73,9 +84,14 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
pos_tex.bind_float(atom->dev_x,4);
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
@ -83,7 +99,10 @@ void AtomicGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_pair_fast.clear();
@ -107,8 +126,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
success=true;
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(inum,numj);
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
@ -116,7 +134,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
@ -130,8 +148,8 @@ template <class numtyp, class acctyp>
inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
const int host_inum,
const int nall, double **host_x,
int *host_type, double *boxlo,
double *boxhi, int *tag,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
@ -144,10 +162,10 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
@ -156,24 +174,25 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success) {
void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
nbor->gpu_nbor());
atom->inum(inum);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
@ -187,7 +206,8 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
atom->add_x_data(host_x,host_type);
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
@ -195,29 +215,32 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
int ** AtomicGPUMemoryT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
atom->inum(inum);
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
boxlo, boxhi, tag, nspecial, special, success);
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
hd_balancer.start_timer();
@ -226,19 +249,21 @@ int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return device->nbor.host_nbor.begin();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double AtomicGPUMemoryT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(AtomicGPUMemory<numtyp,acctyp>);
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(AtomicGPUMemory<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
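
A recurring theme in this file's changes is the reworked host-device balancer: hd_balancer.balance() now decides how many of the inum_full particles the GPU takes, ans->inum(inum) records that count, and host_start marks where the CPU's share begins. A fixed-fraction sketch of the partitioning (the real balancer adapts the split dynamically from its timers):

#include <cstdio>

// Fixed-fraction stand-in for hd_balancer.balance(): the first `inum`
// particles are computed on the device, the remainder on the host.
int balance_fixed(int inum_full, double gpu_split) {
  return (int)(inum_full * gpu_split + 0.5);
}

int main() {
  int inum_full = 1000;
  int inum = balance_fixed(inum_full, 0.75);  // 0.75 is an illustrative split
  int host_start = inum;  // CPU handles atoms [host_start, inum_full)
  printf("GPU: %d atoms, CPU: %d atoms\n", inum, inum_full - host_start);
  return 0;
}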

View File

@ -18,8 +18,6 @@
#ifndef ATOMIC_GPU_MEMORY_H
#define ATOMIC_GPU_MEMORY_H
#define BLOCK_1D 64
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
@ -39,17 +37,28 @@ class AtomicGPUMemory {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(inum, nall, success))
if (atom->resize(nall, success))
pos_tex.bind_float(atom->dev_x,4);
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
@ -85,13 +94,16 @@ class AtomicGPUMemory {
/// Accumulate timers
inline void acc_timers() {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
time_pair.add_to_total();
atom->acc_timers();
}
/// Zero timers
@ -99,6 +111,7 @@ class AtomicGPUMemory {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
@ -108,24 +121,32 @@ class AtomicGPUMemory {
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int timestep, const int f_ago, const int inum_full,
void compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int * compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
@ -148,6 +169,9 @@ class AtomicGPUMemory {
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
@ -167,8 +191,10 @@ class AtomicGPUMemory {
protected:
bool _compiled;
int _block_size;
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);
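
Note that the class now declares both a host-neighboring compute() and two device-neighboring variants; the new int** overload additionally hands ilist and jnum back to the caller so LAMMPS can consume the GPU-built list. A hypothetical caller-side dispatch, with stubs in place of the real overloads:

#include <cstdio>

// Stubs standing in for the two neighboring modes declared above.
void compute_host_nbor() { printf("pair loop, host-built neighbor list\n"); }
void compute_dev_nbor()  { printf("pair loop, GPU-built neighbor list\n"); }

int main() {
  bool gpu_nbor = true;  // illustrative; fixed at init time in the library
  if (gpu_nbor)
    compute_dev_nbor();  // int** overload, also returns ilist/jnum
  else
    compute_host_nbor(); // void overload, list copied from the host
  return 0;
}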

View File

@ -23,23 +23,28 @@ extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor();
}
template <class numtyp, class acctyp>
ChargeGPUMemoryT::~ChargeGPUMemory() {
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
nbor_time_avail=false;
screen=_screen;
@ -48,24 +53,31 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(true,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
_gpu_host,max_nbors,cell_size,false))
return false;
_threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==false) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
nbor=&device->nbor;
_block_size=BLOCK_1D;
if (static_cast<size_t>(_block_size)>ucl_device->group_size())
_block_size=ucl_device->group_size();
_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_split);
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
@ -74,9 +86,14 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
return true;
return success;
}
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
@ -84,7 +101,10 @@ void ChargeGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
_gpu_overhead*=hd_balancer.timestep();
_driver_overhead*=hd_balancer.timestep();
device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes,
_gpu_overhead,_driver_overhead,_threads_per_atom,screen);
if (_compiled) {
k_pair_fast.clear();
@ -109,7 +129,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(inum,numj);
int mn=nbor->max_nbor_loop(inum,numj,ilist);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
@ -117,7 +137,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
@ -131,8 +151,8 @@ template <class numtyp, class acctyp>
inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
const int host_inum,
const int nall, double **host_x,
int *host_type, double *boxlo,
double *boxhi, int *tag,
int *host_type, double *sublo,
double *subhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
@ -145,10 +165,10 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
@ -157,24 +177,26 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q) {
void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
nbor->gpu_nbor());
atom->inum(inum);
int inum=hd_balancer.balance(ago,inum_full,cpu_time);
ans->inum(inum);
host_start=inum;
if (ago==0) {
@ -187,10 +209,14 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_other_data();
atom->add_q_data();
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
@ -198,30 +224,33 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag,
int** ChargeGPUMemoryT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q) {
double *host_q, double *boxlo, double *prd) {
acc_timers();
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
resize_atom(0,nall,success);
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
atom->inum(inum);
hd_balancer.balance(cpu_time);
int inum=hd_balancer.get_gpu_count(ago,inum_full);
ans->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
boxlo, boxhi, tag, nspecial, special, success);
sublo, subhi, tag, nspecial, special, success);
if (!success)
return NULL;
atom->cast_q_data(host_q);
@ -232,20 +261,25 @@ int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_other_data();
atom->add_q_data();
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom);
ans->copy_answers(eflag,vflag,eatom,vatom);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return device->nbor.host_nbor.begin();
return nbor->host_jlist.begin()-host_start;
}
template <class numtyp, class acctyp>
double ChargeGPUMemoryT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(ChargeGPUMemory<numtyp,acctyp>);
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(ChargeGPUMemory<numtyp,acctyp>);
}
template <class numtyp, class acctyp>

View File

@ -18,8 +18,6 @@
#ifndef CHARGE_GPU_MEMORY_H
#define CHARGE_GPU_MEMORY_H
#define BLOCK_1D 64
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
@ -39,19 +37,30 @@ class ChargeGPUMemory {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(inum, nall, success)) {
if (atom->resize(nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
}
ans->resize(inum,success);
}
/// Check if there is enough storage for neighbors and realloc if not
@ -87,13 +96,16 @@ class ChargeGPUMemory {
/// Accumulate timers
inline void acc_timers() {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
}
time_pair.add_to_total();
atom->acc_timers();
}
/// Zero timers
@ -101,6 +113,7 @@ class ChargeGPUMemory {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
ans->zero_timers();
}
/// Copy neighbor list from host
@ -110,24 +123,25 @@ class ChargeGPUMemory {
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int timestep, const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success,
double *charge);
void compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge,
const int nlocal, double *boxlo, double *prd);
/// Pair loop with device neighboring
int * compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge);
int **ilist, int **numj, const double cpu_time, bool &success,
double *charge, double *boxlo, double *prd);
// -------------------------- DEVICE DATA -------------------------
@ -152,6 +166,10 @@ class ChargeGPUMemory {
PairGPUAtom<numtyp,acctyp> *atom;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
@ -171,8 +189,10 @@ class ChargeGPUMemory {
protected:
bool _compiled;
int _block_size;
int _block_size, _block_bio_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;
void compile_kernels(UCL_Device &dev, const char *pair_string);

View File

@ -28,12 +28,12 @@ static CMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
CMMMF.clear();
gpu_mode=CMMMF.device->gpu_mode();
double gpu_split=CMMMF.device->particle_split();
@ -54,13 +54,11 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
fflush(screen);
}
if (world_me==0) {
bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
CMMMF.device->world_barrier();
if (message)
@ -75,45 +73,45 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
CMMMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
CMMMF.estimate_gpu_overhead();
return init_ok;
}
void cmm_gpu_clear() {
CMMMF.clear();
}
int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** cmm_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
return CMMMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success);
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void cmm_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success) {
CMMMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
void cmm_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
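
cmm_gpu_init() above stages initialization: the world-rank-0 process initializes first, all ranks meet at world_barrier(), then the processes sharing each GPU take their turns between gpu_barrier() calls, and init_ok is returned as a status code rather than a bool so failures propagate. A self-contained MPI sketch of the same stagger (grouping by GPU simplified to world ranks):

#include <mpi.h>
#include <cstdio>

// Rank 0 "initializes" first, then the remaining ranks take turns,
// mimicking the barrier-staged setup in cmm_gpu_init().
int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  int init_ok = 0;
  if (rank == 0)
    printf("rank 0 initializing first\n");    // world_me==0 branch
  MPI_Barrier(MPI_COMM_WORLD);                // world_barrier()
  for (int i = 0; i < size; i++) {
    if (rank == i && rank != 0)
      printf("rank %d initializing\n", rank); // gpu_rank==i branch
    MPI_Barrier(MPI_COMM_WORLD);              // gpu_barrier() stand-in
  }
  printf("rank %d init_ok=%d\n", rank, init_ok);
  MPI_Finalize();
  return 0;
}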

View File

@ -18,8 +18,6 @@
#ifndef CMM_GPU_KERNEL
#define CMM_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@ -46,7 +44,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
@ -82,40 +82,56 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@ -164,8 +180,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -183,49 +238,64 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,__global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
__global numtyp* sp_lj_in,__global int *dev_nbor,
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (ii<4)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@ -273,8 +343,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
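The "Reduce answers" blocks added to both kernels fold the per-thread partials for one atom together with a binary tree over the t_per_atom cooperating threads: partial forces and energies are staged in the __local red_acc array indexed by thread id, and each halving of the stride s adds the upper half of a group onto its lower half until lane 0 holds the total. No barrier is issued between steps, which is presumably safe only while t_per_atom stays within one SIMD unit. A CPU-side C++ model of the same arithmetic, with a plain array standing in for shared memory:

// sketch_tree_reduce.cpp -- host model of the t_per_atom reduction above.
// Each group of t_per_atom consecutive "threads" owns one atom.
#include <cstdio>

int main() {
  const int BLOCK_PAIR = 64;   // kernel block size
  const int t_per_atom = 4;    // threads cooperating on one atom
  float red_acc[BLOCK_PAIR];   // stand-in for __local red_acc[r][tid]

  // Fake per-thread partial energies: thread tid contributed tid+1.
  for (int tid = 0; tid < BLOCK_PAIR; tid++)
    red_acc[tid] = float(tid + 1);

  // Same folding loop as the kernel, applied across the whole block.
  // On the GPU the lanes run in lockstep; here we just iterate them.
  for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
    for (int tid = 0; tid < BLOCK_PAIR; tid++) {
      int offset = tid % t_per_atom;        // lane within the group
      if (offset < (int)s)
        red_acc[tid] += red_acc[tid + s];
    }

  // Lane 0 of each group now holds the group total; group 0 summed
  // partials 1+2+3+4 = 10.
  printf("group 0 total: %g\n", red_acc[0]);
  return 0;
}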

View File

@ -42,22 +42,26 @@ int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmm_cut_gpu_kernel);
int CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmm_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int cmm_types=ntypes;
shared_types=false;
if (cmm_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
cmm_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) {
cmm_types=max_shared_types;
shared_types=true;
}
_cmm_types=cmm_types;
@ -84,7 +88,7 @@ bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -122,9 +126,10 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -133,16 +138,18 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}
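The grid-size change in loop() follows directly from the new thread geometry: with t_per_atom threads assigned to each atom, a block of BX threads covers only BX/t_per_atom atoms, so the block count grows by the same factor. A quick check of the arithmetic (values illustrative):

// sketch_grid_size.cpp -- the launch-geometry arithmetic used in loop().
#include <cmath>
#include <cstdio>

int main() {
  const int BX = 64;           // block size (BLOCK_PAIR)
  const int t_per_atom = 4;    // threads cooperating per atom
  const int inum = 1000;       // local atoms this step (illustrative)

  // Old geometry: one thread per atom.
  int GX_old = static_cast<int>(ceil(static_cast<double>(inum)/BX));
  // New geometry: each block handles BX/t_per_atom atoms.
  int GX_new = static_cast<int>(ceil(static_cast<double>(inum)/
                                     (BX/t_per_atom)));
  printf("GX old=%d new=%d\n", GX_old, GX_new);  // 16 vs 63 blocks
  return 0;
}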

View File

@ -29,13 +29,20 @@ class CMM_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
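With init() now reporting an integer status rather than a bool, a caller can turn the codes documented above into messages. A hypothetical sketch of that mapping — the switch mirrors the doc comment, but the function name and wording are illustrative, not part of the library:

// sketch_init_codes.cpp -- mapping the documented init() return codes.
#include <cstdio>
#include <initializer_list>

static const char *init_error(int code) {
  switch (code) {
    case  0: return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory error";
    case -4: return "GPU library was not compiled for the GPU";
    case -5: return "double precision is not supported on the card";
    default: return "unknown error";
  }
}

int main() {
  for (int code : {0, -1, -3, -4, -5})
    printf("%3d: %s\n", code, init_error(code));
  return 0;
}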

View File

@ -28,14 +28,14 @@ static CMML_GPU_Memory<PRECISION,ACC_PRECISION> CMMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
CMMLMF.clear();
gpu_mode=CMMLMF.device->gpu_mode();
double gpu_split=CMMLMF.device->particle_split();
@ -56,15 +56,12 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
fflush(screen);
}
if (world_me==0) {
bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum,
nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e,g_ewald);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,g_ewald);
CMMLMF.device->world_barrier();
if (message)
@ -79,48 +76,51 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum,
nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald);
CMMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
CMMLMF.estimate_gpu_overhead();
return init_ok;
}
void cmml_gpu_clear() {
CMMLMF.clear();
}
int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** cmml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return CMMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q,boxlo,prd);
}
void cmml_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
CMMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
void cmml_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
host_q,nlocal,boxlo,prd);
}
double cmml_gpu_bytes() {

View File

@ -18,8 +18,6 @@
#ifndef CMML_GPU_KERNEL
#define CMML_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@ -54,7 +52,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q)
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, __global numtyp *q_ ,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
@ -117,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -213,8 +229,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -234,51 +291,67 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const numtyp qqrd2e, const numtyp g_ewald,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (ii<8)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
lj3[ii]=lj3_in[ii];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -351,8 +424,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Store answers
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
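Both kernel variants above now receive dev_nbor and dev_packed and choose their iteration stride at run time: when the two pointers coincide, the neighbor list is a row-major matrix (one column per atom, row pitch nbor_pitch) and a group of t_per_atom threads strides by t_per_atom*nbor_pitch; otherwise the third row of dev_nbor holds an offset into the densely packed dev_packed array and the stride is simply t_per_atom. A host-side sketch of that selection, with plain integers standing in for the __global pointers:

// sketch_nbor_stride.cpp -- host model of the dev_nbor/dev_packed
// stride selection added to the kernels above.
#include <cstdio>

int main() {
  const int nbor_pitch = 256;   // row pitch of the neighbor matrix
  const int t_per_atom = 4;     // threads cooperating per atom
  const int offset = 1;         // this thread's lane within its group
  bool packed_separate = true;  // models dev_nbor != dev_packed

  int n_stride, first_step;
  if (!packed_separate) {
    // Row-major matrix: neighbor k of atom i lives at i + k*nbor_pitch.
    first_step = offset * nbor_pitch;
    n_stride   = t_per_atom * nbor_pitch;
  } else {
    // Dense packed list: the neighbors of one atom are contiguous.
    first_step = offset;
    n_stride   = t_per_atom;
  }
  printf("lane starts %d elements in, strides by %d\n",
         first_step, n_stride);
  return 0;
}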

View File

@ -43,26 +43,30 @@ int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq,
const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmmc_long_gpu_kernel);
int CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq,
const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmmc_long_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
@ -95,7 +99,7 @@ bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -133,9 +137,10 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -144,19 +149,21 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald);
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald);
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -29,15 +29,22 @@ class CMML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, int ** cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, int ** cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/

View File

@ -28,16 +28,16 @@ static CRML_GPU_Memory<PRECISION,ACC_PRECISION> CRMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
CRMLMF.clear();
gpu_mode=CRMLMF.device->gpu_mode();
double gpu_split=CRMLMF.device->particle_split();
@ -58,16 +58,13 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
fflush(screen);
}
if (world_me==0) {
bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
epsilon,sigma,mix_arithmetic);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
epsilon,sigma,mix_arithmetic);
CRMLMF.device->world_barrier();
if (message)
@ -82,50 +79,54 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald,
cut_lj_innersq, denom_lj, epsilon, sigma,
mix_arithmetic);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,
sigma, mix_arithmetic);
CRMLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
CRMLMF.estimate_gpu_overhead();
return init_ok;
}
void crml_gpu_clear() {
CRMLMF.clear();
}
int * crml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** crml_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return CRMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void crml_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
CRMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
void crml_gpu_compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q, const int nlocal,
double *boxlo, double *prd) {
CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double crml_gpu_bytes() {

View File

@ -54,7 +54,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
@ -90,6 +90,7 @@ __inline float fetch_q(const int& i, const float *q)
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_BIO_PAIR 64
#endif
@ -98,18 +99,22 @@ __inline float fetch_q(const int& i, const float *q)
__inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
const int lj_types, __global numtyp *sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_, const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald,
const numtyp denom_lj, const numtyp cut_bothsq,
const numtyp cut_ljsq, const numtyp cut_lj_innersq) {
const numtyp cut_ljsq, const numtyp cut_lj_innersq,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
@ -120,29 +125,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -219,8 +236,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -240,50 +298,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_, const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald,
const numtyp denom_lj, const numtyp cut_bothsq,
const numtyp cut_ljsq,
const numtyp cut_lj_innersq) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const numtyp denom_lj,
const numtyp cut_bothsq, const numtyp cut_ljsq,
const numtyp cut_lj_innersq,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
__local numtyp sp_lj[8];
if (ii<8)
sp_lj[ii]=sp_lj_in[ii];
ljd[ii]=ljd_in[ii];
ljd[ii+64]=ljd_in[ii+64];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
ljd[tid]=ljd_in[tid];
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -366,8 +439,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
}
} // for nbor
} // if ii
// Store answers
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
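The factor_lj/factor_coul lookups in these kernels depend on the special-bond bits packed into each neighbor index: sbmask(j) = j >> SBBITS & 3 selects which special_lj/special_coul weight applies, and masking those bits off recovers the atom index. A small sketch of the encoding — SBBITS=30 is assumed here for 32-bit indices, following the convention used elsewhere in LAMMPS:

// sketch_sbmask.cpp -- decoding special-bond bits from a packed
// neighbor index, as the kernels' sbmask() does.
#include <cstdio>

const int SBBITS = 30;                        // assumed bit position
int sbmask(int j) { return j >> SBBITS & 3; }

int main() {
  // Encode: atom index 12345 flagged as a 1-2 special neighbor (code 1).
  int j = 12345 | (1 << SBBITS);
  float sp_lj[4] = {1.0f, 0.0f, 0.0f, 0.5f};  // illustrative weights
  float factor_lj = sp_lj[sbmask(j)];
  int atom = j & ((1 << SBBITS) - 1);         // strip the flag bits
  printf("atom %d, special code %d, factor %g\n",
         atom, sbmask(j), factor_lj);
  return 0;
}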

View File

@ -43,7 +43,7 @@ int CRML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool CRML_GPU_MemoryT::init(const int ntypes,
int CRML_GPU_MemoryT::init(const int ntypes,
double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
@ -56,20 +56,24 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
const double g_ewald, const double cut_lj_innersq,
const double denom_lj, double **epsilon,
double **sigma, const bool mix_arithmetic) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,crml_gpu_kernel);
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,crml_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (this->_block_size>=64 && mix_arithmetic)
if (this->_block_bio_size>=64 && mix_arithmetic)
shared_types=true;
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
int h_size=lj_types*lj_types;
if (h_size<MAX_BIO_SHARED_TYPES)
h_size=MAX_BIO_SHARED_TYPES;
int max_bio_shared_types=this->device->max_bio_shared_types();
if (h_size<max_bio_shared_types)
h_size=max_bio_shared_types;
UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<h_size*32; i++)
@ -79,7 +83,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_lj3,host_lj4);
ljd.alloc(MAX_BIO_SHARED_TYPES,*(this->ucl_device),UCL_READ_ONLY);
ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
@ -99,7 +103,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -125,7 +129,7 @@ double CRML_GPU_MemoryT::host_memory_usage() const {
template <class numtyp, class acctyp>
void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
const int BX=this->_block_bio_size;
int eflag, vflag;
if (_eflag)
eflag=1;
@ -137,9 +141,10 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -147,21 +152,24 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(),
&sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
&_cut_ljsq, &_cut_lj_innersq);
&_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq);
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -29,17 +29,24 @@ class CRML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double host_cut_bothsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald,
const double cut_lj_innersq, const double denom_lj,
double **epsilon, double **sigma, const bool mix_arithmetic);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double host_cut_bothsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald,
const double cut_lj_innersq, const double denom_lj,
double **epsilon, double **sigma, const bool mix_arithmetic);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/

View File

@ -49,14 +49,14 @@ void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool gb_gpu_init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const double cell_size, int &gpu_mode, FILE *screen) {
int gb_gpu_init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const double cell_size, int &gpu_mode, FILE *screen) {
GBMF.clear();
gpu_mode=GBMF.device->gpu_mode();
double gpu_split=GBMF.device->particle_split();
@ -77,14 +77,12 @@ bool gb_gpu_init(const int ntypes, const double gamma,
fflush(screen);
}
if (world_me==0) {
bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, cell_size, gpu_split, screen);
GBMF.device->world_barrier();
if (message)
@ -99,22 +97,22 @@ bool gb_gpu_init(const int ntypes, const double gamma,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma,
epsilon, host_lshape, form, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum, nall,
max_nbors, cell_size, gpu_split, screen);
GBMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
GBMF.estimate_gpu_overhead();
return init_ok;
}
// ---------------------------------------------------------------------------
@ -131,8 +129,8 @@ template <class gbmtyp>
inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
const int host_inum, const int nall,
double **host_x, double **host_quat,
int *host_type, double *boxlo,
double *boxhi, bool &success) {
int *host_type, double *sublo,
double *subhi, bool &success) {
gbm.nbor_time_avail=true;
success=true;
@ -144,7 +142,7 @@ inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
gbm.atom->cast_copy_x(host_x,host_type);
int mn;
gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom,
boxlo, boxhi, NULL, NULL, NULL, success, mn);
sublo, subhi, NULL, NULL, NULL, success, mn);
gbm.nbor->copy_unpacked(inum,mn);
gbm.last_ellipse=inum;
gbm.max_last_ellipse=inum;
@ -163,7 +161,7 @@ void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall,
gbm.nbor_time_avail=true;
int mn=gbm.nbor->max_nbor_loop(inum,numj);
int mn=gbm.nbor->max_nbor_loop(inum,numj,ilist);
gbm.resize_atom(inum,nall,success);
gbm.resize_local(inum,0,mn,osize,success);
if (!success)
@ -216,9 +214,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum())/
(BX/gbm._threads_per_atom)));
int stride=gbm.nbor->nbor_pitch();
int ainum=gbm.atom->inum();
int ainum=gbm.ans->inum();
int anall=gbm.atom->nall();
if (gbm.multiple_forms) {
@ -226,7 +225,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
if (gbm.last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
static_cast<double>(BX)));
(BX/gbm._threads_per_atom)));
gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
@ -237,11 +236,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
&gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(),
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall);
&stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(),
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall,
&gbm._threads_per_atom);
gbm.time_gayberne.stop();
if (gbm.last_ellipse==gbm.atom->inum()) {
if (gbm.last_ellipse==gbm.ans->inum()) {
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
@ -254,9 +254,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// ------------ SPHERE_ELLIPSE ---------------
gbm.time_kernel2.start();
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum()-
gbm.last_ellipse)/BX));
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(),
GX=static_cast<int>(ceil(static_cast<double>(gbm.ans->inum()-
gbm.last_ellipse)/
(BX/gbm._threads_per_atom)));
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE);
gbm.time_kernel2.stop();
@ -266,13 +267,14 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(),
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
&vflag, &gbm.last_ellipse, &ainum, &anall);
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
&vflag, &gbm.last_ellipse, &ainum, &anall,
&gbm._threads_per_atom);
gbm.time_gayberne2.stop();
} else {
gbm.atom->dev_ans.zero();
gbm.atom->dev_engv.zero();
gbm.ans->dev_ans.zero();
gbm.ans->dev_engv.zero();
gbm.time_kernel.stop();
gbm.time_gayberne.start();
gbm.time_gayberne.stop();
@ -284,29 +286,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// ------------ LJ ---------------
gbm.time_pair.start();
if (gbm.last_ellipse<gbm.atom->inum()) {
if (gbm.last_ellipse<gbm.ans->inum()) {
if (gbm.shared_types) {
GBMF.k_lj_fast.set_size(GX,BX);
GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
&stride, &gbm.nbor->dev_packed.begin(),
&gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
&gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
&gbm._threads_per_atom);
} else {
GBMF.k_lj.set_size(GX,BX);
GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm._lj_types,
&gbm.gamma_upsilon_mu.begin(), &stride,
&gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
&gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(),
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall,
&gbm._threads_per_atom);
}
}
gbm.time_pair.stop();
} else {
gbm.time_kernel.start();
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE,
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
@ -315,9 +319,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.atom->dev_ans.begin(), &ainum,
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &ainum, &anall);
&stride, &gbm.ans->dev_ans.begin(), &ainum,
&gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &ainum, &anall, &gbm._threads_per_atom);
gbm.time_gayberne.stop();
}
}
@ -326,30 +330,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// Reneighbor on GPU if necessary and then compute forces, torques, energies
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
double *boxlo, double *boxhi, const bool eflag,
const bool vflag, const bool eatom,
inline int** _gb_gpu_compute_n(gbmtyp &gbm, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
double *sublo, double *subhi, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success,
double **host_quat) {
int **ilist, int **jnum, const double cpu_time,
bool &success, double **host_quat) {
gbm.acc_timers();
if (inum_full==0) {
host_start=0;
gbm.zero_timers();
return NULL;
}
gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor());
int inum=gbm.hd_balancer.get_gpu_count(timestep,ago,inum_full);
gbm.atom->inum(inum);
gbm.hd_balancer.balance(cpu_time);
int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full);
gbm.ans->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x,
host_quat, host_type, boxlo, boxhi, success);
host_quat, host_type, sublo, subhi, success);
if (!success)
return NULL;
gbm.atom->cast_quat_data(host_quat[0]);
@ -361,47 +366,49 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
gbm.atom->add_x_data(host_x,host_type);
}
gbm.atom->add_other_data();
gbm.atom->add_quat_data();
*ilist=gbm.nbor->host_ilist.begin();
*jnum=gbm.nbor->host_acc.begin();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.atom->copy_answers(eflag,vflag,eatom,vatom);
gbm.ans->copy_answers(eflag,vflag,eatom,vatom);
gbm.device->add_ans_object(gbm.ans);
gbm.hd_balancer.stop_timer();
return gbm.device->nbor.host_nbor.begin();
return gbm.nbor->host_jlist.begin()-host_start;
}
int * gb_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success,
double **host_quat) {
return _gb_gpu_compute_n(GBMF, timestep, ago, inum_full, nall, host_x,
host_type, boxlo, boxhi, eflag, vflag, eatom, vatom,
host_start, cpu_time, success, host_quat);
int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double **host_quat) {
return _gb_gpu_compute_n(GBMF, ago, inum_full, nall, host_x, host_type, sublo,
subhi, eflag, vflag, eatom, vatom, host_start, ilist,
jnum, cpu_time, success, host_quat);
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, torques,..
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
const int inum_full,const int nall,double **host_x,
int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success,
double **host_quat) {
inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full,
const int nall,double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double **host_quat) {
gbm.acc_timers();
if (inum_full==0) {
host_start=0;
gbm.zero_timers();
return NULL;
}
int ago=gbm.hd_balancer.ago_first(f_ago);
int inum=gbm.hd_balancer.balance(timestep,ago,inum_full,cpu_time,
gbm.nbor->gpu_nbor());
gbm.atom->inum(inum);
int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time);
gbm.ans->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
@@ -421,21 +428,21 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
gbm.atom->add_x_data(host_x,host_type);
gbm.atom->add_other_data();
gbm.atom->add_quat_data();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list);
gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list);
gbm.device->add_ans_object(gbm.ans);
gbm.hd_balancer.stop_timer();
return list;
}
int * gb_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double **host_quat) {
return _gb_gpu_compute(GBMF, timestep, ago, inum_full, nall, host_x,
int * gb_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double **host_quat) {
return _gb_gpu_compute(GBMF, ago, inum_full, nall, host_x,
host_type, ilist, numj, firstneigh, eflag, vflag,
eatom, vatom, host_start, cpu_time, success,
host_quat);
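Note that the balancer calls lost their timestep argument; the split decision itself is unchanged: choose how many of the inum_full atoms the GPU owns this step, with the host taking the tail starting at host_start. A toy fixed-split model of that decision (names and the 0.75 split are hypothetical):

#include <cstdio>

// Toy stand-in for the host-device balancer's get_gpu_count().
struct Balancer {
  double gpu_split;
  int get_gpu_count(int /*ago*/, int inum_full) {
    return (int)(gpu_split * inum_full);
  }
};

int main() {
  Balancer hd_balancer{0.75};
  int inum = hd_balancer.get_gpu_count(0, 1000);
  int host_start = inum;          // CPU handles atoms [host_start, inum_full)
  printf("GPU atoms: %d, host_start: %d\n", inum, host_start);
  return 0;
}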

View File

@@ -18,7 +18,6 @@
#ifndef GB_GPU_EXTRA_H
#define GB_GPU_EXTRA_H
#define MAX_SHARED_TYPES 8
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef _DOUBLE_DOUBLE
@@ -47,7 +46,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
#else
@@ -58,6 +57,8 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
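With BLOCK_PAIR and MAX_SHARED_TYPES now supplied here for OpenCL (and by nv_kernel_def.h under NV_KERNEL), one kernel source builds for both back ends. A toy kernel written only against this macro vocabulary, assuming the elided part of the header maps THREAD_ID_X, __kernel, __global and __local the way ucl_nv_kernel.h does; it compiles as OpenCL directly, or as CUDA through those mappings (launch with a local size of at most BLOCK_PAIR):

// Toy per-block sum using only the portability macros.
__kernel void block_sum(__global float *in, __global float *out) {
  __local float scratch[BLOCK_PAIR];
  int tid = THREAD_ID_X;
  scratch[tid] = in[BLOCK_ID_X * BLOCK_SIZE_X + tid];
  __syncthreads();
  for (int s = BLOCK_SIZE_X / 2; s > 0; s >>= 1) {
    if (tid < s)
      scratch[tid] += scratch[tid + s];
    __syncthreads();
  }
  if (tid == 0)
    out[BLOCK_ID_X] = scratch[0];
}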

View File

@@ -97,17 +97,17 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int nall) {
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
if (ii<4)
sp_lj[ii]=gum[ii+3];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
__syncthreads();
if (ii<inum) {
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
@@ -121,262 +121,309 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
{
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_times3(ishape,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_times3(well[itype],a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
{
numtyp g2[9];
{
gpu_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_times3(ishape,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_times3(well[itype],a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
j &= NEIGHMASK;
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,ishape,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// -- replace r12 with r12 hat
// Compute b12
numtyp b2[9], b12[9];
{
gpu_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
} // if ii
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[7][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=tor.x;
red_acc[4][tid]=tor.y;
red_acc[5][tid]=tor.z;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,ishape,temp);
numtyp temp1 = -eta*gum[1];
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
tor.x=red_acc[3][tid];
tor.y=red_acc[4][tid];
tor.z=red_acc[5][tid];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
if (eflag>0 || vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
red_acc[6][tid]=energy;
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<7; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
energy=red_acc[6][tid];
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=astride;
}
}
ans[ii]=f;
ans[ii+astride]=tor;
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=astride;
}
}
ans[ii]=f;
ans[ii+astride]=tor;
} // if ii
}
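When t_per_atom>1 each atom's force and torque are spread across t_per_atom consecutive threads, so the kernel folds them with the shared-memory tree reduction above before the thread with offset==0 stores the result; the reduction loop carries no barrier, which presumably relies on t_per_atom not exceeding the warp/wavefront width. The same fold in isolation, as a sequential sketch:

#include <cstdio>

// Tree reduction over t_per_atom partial sums, one fold per halving of s.
int main() {
  const int t_per_atom = 4;
  float red_acc[t_per_atom] = {1.f, 2.f, 3.f, 4.f};    // partial forces
  for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
    for (unsigned int offset = 0; offset < s; offset++) // "if (offset < s)"
      red_acc[offset] += red_acc[offset + s];
  printf("reduced force = %g\n", red_acc[0]);           // what offset==0 stores
  return 0;
}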

View File

@@ -34,33 +34,36 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag,const int start, const int inum,
const int nall) {
__local numtyp sp_lj[4];
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
if (ii<4)
sp_lj[ii]=gum[ii+3];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
__syncthreads();
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+stride*numj;
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
@@ -69,7 +72,7 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
numtyp one_well=well[itype].x;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=stride) {
for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@@ -241,8 +244,47 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@@ -265,39 +307,42 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int nall) {
__local numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
if (ii<4)
sp_lj[ii]=gum[ii+3];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
__syncthreads();
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=gum[3];
sp_lj[1]=gum[4];
sp_lj[2]=gum[5];
sp_lj[3]=gum[6];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=stride) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@@ -338,8 +383,47 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
@@ -361,50 +445,54 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride,
__global int *dev_ij, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag,const int vflag, const int start,
const int inum, const int nall) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int nall, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom+start;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<4)
sp_lj[ii]=gum[ii+3];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (tid<4)
sp_lj[tid]=gum[tid+3];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
nbor+=mul24(offset,stride);
int n_stride=mul24(t_per_atom,stride);
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=stride) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@@ -443,8 +531,47 @@ __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;

View File

@@ -18,8 +18,6 @@
#ifndef PAIR_GPU_KERNEL_H
#define PAIR_GPU_KERNEL_H
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@@ -32,7 +30,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
#else
@@ -42,6 +40,7 @@
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define MAX_SHARED_TYPES 8
#endif

View File

@@ -32,30 +32,35 @@ template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
_max_bytes(0.0) {
device=&pair_gpu_device;
ans=new PairGPUAns<numtyp,acctyp>();
nbor=new PairGPUNbor;
}
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
clear();
delete ans;
delete nbor;
}
template <class numtyp, class acctyp>
int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+
nbor->bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
const double upsilon, const double mu,
double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape,
int **h_form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, const double *host_special_lj,
const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *_screen) {
int GB_GPU_MemoryT::init(const int ntypes, const double gamma,
const double upsilon, const double mu,
double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape,
int **h_form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, const double *host_special_lj,
const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *_screen) {
nbor_time_avail=false;
screen=_screen;
@@ -64,24 +69,24 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor);
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(false,true,nlocal,host_nlocal,nall,0,gpu_nbor,_gpu_host,
max_nbors,cell_size,true))
return false;
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0,
_gpu_host,max_nbors,cell_size,true);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;
nbor=&device->nbor;
_block_size=BLOCK_1D;
if (static_cast<size_t>(_block_size)>ucl_device->group_size())
_block_size=ucl_device->group_size();
_block_size=device->pair_block_size();
compile_kernels(*ucl_device);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_split);
hd_balancer.init(device,gpu_nbor,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
@@ -90,8 +95,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && _block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=device->max_shared_types();
if (lj_types<=max_shared_types && _block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
@@ -186,12 +192,19 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
}
if (multiple_forms)
atom->dev_ans.zero();
ans->dev_ans.zero();
_max_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
_max_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
// Memory for ilist ordered by particle type
return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS)
return 0;
else return -3;
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead);
}
template <class numtyp, class acctyp>
@@ -209,9 +222,9 @@ void GB_GPU_MemoryT::clear() {
// Output any timing information
acc_timers();
double single[6], times[6];
double single[9], times[9];
single[0]=atom->transfer_time();
single[0]=atom->transfer_time()+ans->transfer_time();
single[1]=nbor->time_nbor.total_seconds();
single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
nbor->time_kernel.total_seconds();
@@ -220,15 +233,18 @@ void GB_GPU_MemoryT::clear() {
single[4]=time_pair.total_seconds();
else
single[4]=0;
single[5]=atom->cast_time();
single[5]=atom->cast_time()+ans->cast_time();
single[6]=_gpu_overhead;
single[7]=_driver_overhead;
single[8]=ans->cpu_idle_time();
MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica());
MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica());
double avg_split=hd_balancer.all_avg_split();
_max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
sigma_epsilon.row_bytes()+cut_form.row_bytes()+
shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
gamma_upsilon_mu.row_bytes();
gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());
@@ -255,10 +271,19 @@ void GB_GPU_MemoryT::clear() {
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
}
fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
}
_max_bytes=0.0;
@@ -299,10 +324,9 @@ void GB_GPU_MemoryT::clear() {
template <class numtyp, class acctyp>
double GB_GPU_MemoryT::host_memory_usage() const {
return device->atom.host_memory_usage()+
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(GB_GPU_Memory<numtyp,acctyp>)+
device->nbor.max_atoms()*sizeof(int);
return device->atom.host_memory_usage()+nbor->host_memory_usage()+
4*sizeof(numtyp)+sizeof(GB_GPU_Memory<numtyp,acctyp>)+
nbor->max_atoms()*sizeof(int);
}
template <class numtyp, class acctyp>

View File

@@ -18,8 +18,6 @@
#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H
#define BLOCK_1D 64
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
@@ -35,23 +33,34 @@ class GB_GPU_Memory {
* \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \return false if there is not sufficient memory or device init prob **/
bool init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
const double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *screen);
* \return false if there is not sufficient memory or device init prob
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
const double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *screen);
/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
atom->resize(inum, nall, success);
if (multiple_forms) atom->dev_ans.zero();
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
atom->resize(nall, success);
ans->resize(inum, success);
if (multiple_forms) ans->dev_ans.zero();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
@@ -74,7 +83,7 @@ class GB_GPU_Memory {
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
}
nbor->resize(nlocal,host_inum,max_nbors,success);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
@@ -91,19 +100,22 @@ class GB_GPU_Memory {
/// Accumulate timers
inline void acc_timers() {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
if (device->time_device()) {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_kernel.add_to_total();
time_gayberne.add_to_total();
if (multiple_forms) {
time_kernel2.add_to_total();
time_gayberne2.add_to_total();
time_pair.add_to_total();
}
atom->acc_timers();
ans->acc_timers();
}
time_kernel.add_to_total();
time_gayberne.add_to_total();
if (multiple_forms) {
time_kernel2.add_to_total();
time_gayberne2.add_to_total();
time_pair.add_to_total();
}
atom->acc_timers();
}
/// Accumulate timers
@@ -117,6 +129,7 @@ class GB_GPU_Memory {
time_pair.zero();
}
atom->zero_timers();
ans->zero_timers();
}
// -------------------------- DEVICE DATA -------------------------
@@ -168,6 +181,10 @@ class GB_GPU_Memory {
int last_ellipse, max_last_ellipse;
// ------------------------ FORCE/ENERGY DATA -----------------------
PairGPUAns<numtyp,acctyp> *ans;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
@@ -183,10 +200,12 @@ class GB_GPU_Memory {
UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj;
inline int block_size() { return _block_size; }
int _threads_per_atom;
private:
bool _allocated, _compiled;
int _block_size;
double _max_bytes;
double _gpu_overhead, _driver_overhead;
void compile_kernels(UCL_Device &dev);
};
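The init() return convention documented above (0 on success, negative codes for specific failures) moves error reporting to the caller. A minimal decoder consistent with those codes (the message strings are a sketch, not LAMMPS's actual output):

#include <cstdio>

const char *init_error(int code) {
  switch (code) {
    case 0:  return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory on device";
    case -4: return "GPU library not compiled for this accelerator";
    case -5: return "double precision not supported on card";
    default: return "unknown error";
  }
}

int main() {
  printf("init: %s\n", init_error(-3));
  return 0;
}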

View File

@@ -1,2 +1,2 @@
Geryon Version 10.280
Geryon Version 11.094

View File

@@ -167,6 +167,7 @@ class UCL_Device {
int _device, _num_devices;
std::vector<cudaDeviceProp> _properties;
std::vector<cudaStream_t> _cq;
std::vector<int> _device_ids;
};
// Grabs the properties for all devices
@@ -178,6 +179,7 @@ inline UCL_Device::UCL_Device() {
if (deviceProp.major == 9999 && deviceProp.minor == 9999)
break;
_properties.push_back(deviceProp);
_device_ids.push_back(dev);
}
_device=-1;
_cq.push_back(cudaStream_t());
@@ -194,7 +196,7 @@ inline void UCL_Device::set(int num) {
return;
for (int i=1; i<num_queues(); i++) pop_command_queue();
cudaThreadExit();
CUDA_SAFE_CALL_NS(cudaSetDevice(num));
CUDA_SAFE_CALL_NS(cudaSetDevice(_device_ids[num]));
_device=num;
}
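The indirection through _device_ids is needed because devices reporting compute capability 9999 (the emulation placeholder) are skipped when _properties is built, so Geryon's device index no longer matches the raw CUDA ordinal. The filter-and-remap in miniature (device data hypothetical):

#include <cstdio>
#include <vector>

int main() {
  int major[] = {2, 9999, 1, 3};       // pretend ordinal 1 is the placeholder
  std::vector<int> device_ids;         // geryon index -> CUDA ordinal
  for (int dev = 0; dev < 4; dev++)
    if (major[dev] != 9999)
      device_ids.push_back(dev);
  int num = 1;                         // user selects geryon device 1
  printf("cudaSetDevice(%d)\n", device_ids[num]);  // ordinal 2, not 1
  return 0;
}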

View File

@@ -42,6 +42,7 @@ inline void ucl_sync(CUstream &stream) {
}
struct NVDProperties {
int device_id;
std::string name;
int major;
int minor;
@@ -208,15 +209,20 @@ inline UCL_Device::UCL_Device() {
for (int dev=0; dev<_num_devices; ++dev) {
CUdevice m;
CU_SAFE_CALL_NS(cuDeviceGet(&m,dev));
int major, minor;
CU_SAFE_CALL_NS(cuDeviceComputeCapability(&major,&minor,m));
if (major==9999)
continue;
_properties.push_back(NVDProperties());
_properties.back().device_id=dev;
_properties.back().major=major;
_properties.back().minor=minor;
char namecstr[1024];
CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
_properties.back().name=namecstr;
CU_SAFE_CALL_NS(cuDeviceComputeCapability(&_properties.back().major,
&_properties.back().minor,m));
CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
@@ -262,9 +268,9 @@ inline void UCL_Device::set(int num) {
CU_SAFE_CALL_NS(cuCtxDestroy(_context));
for (int i=1; i<num_queues(); i++) pop_command_queue();
}
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,num));
_device=_properties[num].device_id;
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,_device));
CU_SAFE_CALL_NS(cuCtxCreate(&_context,0,_cu_device));
_device=num;
}
// List all devices along with all properties

View File

@@ -25,6 +25,7 @@
#define NVD_TIMER_H
#include "nvd_macros.h"
#include "nvd_device.h"
namespace ucl_cudadr {
@@ -66,12 +67,23 @@ class UCL_Timer {
/// Stop timing on command queue
inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
/// Block until the start event has been reached on device
inline void sync_start()
{ CU_SAFE_CALL(cuEventSynchronize(start_event)); }
/// Block until the stop event has been reached on device
inline void sync_stop()
{ CU_SAFE_CALL(cuEventSynchronize(stop_event)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero() {
CU_SAFE_CALL(cuEventRecord(start_event,_cq));
CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
}
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()
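The new sync_start()/sync_stop() methods let the host block on a recorded event without reading elapsed time, which the overhead estimation relies on. A self-contained driver-API program exercising the same event pattern that backs this timer (no kernel is launched, so the elapsed time is near zero; link with -lcuda):

#include <cuda.h>
#include <cstdio>

#define CU_CHECK(x) do { CUresult r = (x); if (r != CUDA_SUCCESS) { \
  printf("CUDA error %d at line %d\n", (int)r, __LINE__); return 1; } } while (0)

int main() {
  CU_CHECK(cuInit(0));
  CUdevice dev;  CU_CHECK(cuDeviceGet(&dev, 0));
  CUcontext ctx; CU_CHECK(cuCtxCreate(&ctx, 0, dev));
  CUevent start_event, stop_event;
  CU_CHECK(cuEventCreate(&start_event, CU_EVENT_DEFAULT));
  CU_CHECK(cuEventCreate(&stop_event, CU_EVENT_DEFAULT));
  CU_CHECK(cuEventRecord(start_event, 0));   // UCL_Timer::start()
  CU_CHECK(cuEventRecord(stop_event, 0));    // UCL_Timer::stop()
  CU_CHECK(cuEventSynchronize(stop_event));  // UCL_Timer::sync_stop()
  float ms;
  CU_CHECK(cuEventElapsedTime(&ms, start_event, stop_event));
  printf("elapsed: %f ms\n", ms);
  CU_CHECK(cuCtxDestroy(ctx));
  return 0;
}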

View File

@@ -25,6 +25,7 @@
#define OCL_TIMER_H
#include "ocl_macros.h"
#include "ocl_device.h"
namespace ucl_opencl {
@@ -67,10 +68,21 @@ class UCL_Timer {
/// Stop timing on default command queue
inline void stop() { clEnqueueMarker(_cq,&stop_event); }
/// Block until the start event has been reached on device
inline void sync_start()
{ CL_SAFE_CALL(clWaitForEvents(1,&start_event)); }
/// Block until the stop event has been reached on device
inline void sync_stop()
{ CL_SAFE_CALL(clWaitForEvents(1,&stop_event)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero()
{ clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }
/// Set the total time to zero
inline void zero_total() { _total_time=0.0; }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()

View File

@@ -13,7 +13,7 @@
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@@ -206,6 +206,191 @@
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29, class t30>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
}
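Every overload above forwards N pointers to add_arg one by one; Geryon predates C++11, hence the fixed-arity ladder up to 30 arguments. For comparison only, a single C++17 variadic definition covers the whole family (the add_arg here is a stand-in, not Geryon's):

#include <cstdio>

template <class T> void add_arg(T *arg) { printf("arg at %p\n", (void *)arg); }

// One variadic definition replacing every fixed-arity add_args overload.
template <class... Ts> void add_args(Ts *...args) { (add_arg(args), ...); }

int main() {
  int a = 1; double b = 2.5; float c = 0.5f;
  add_args(&a, &b, &c);   // any arity, no per-count overload needed
  return 0;
}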
// ---------------------------------------------------------------------------
@@ -439,6 +624,211 @@
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29, class t30>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
run();
}
// ---------------------------------------------------------------------------
template <class t1>
@@ -671,3 +1061,208 @@
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20,
class t21, class t22, class t23, class t24, class t25,
class t26, class t27, class t28, class t29, class t30>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20,
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25,
t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25);
add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30);
run(cq);
}
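The overloads above are purely mechanical: each one clears the argument list, forwards every pointer with add_arg(), and launches. A minimal usage sketch, assuming a UCL_Kernel named k_pair whose launch geometry is already chosen (k_pair, GX, BX, and the argument names are illustrative, not library identifiers):
k_pair.set_size(GX,BX);                           // grid and block size
k_pair.run(&lj1.begin(), &eflag, &vflag, &ainum); // clear_args + 4x add_arg + run()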

View File

@ -13,7 +13,7 @@
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@ -61,20 +61,23 @@ class UCL_D_Mat : public UCL_BaseMat {
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_rows=rows;
_cols=cols;
int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
exit(1);
#endif
return err;
}
_kind=kind;
_rows=rows;
_cols=cols;
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifdef _OCL_MAT
_offset=0;
@ -94,20 +97,23 @@ class UCL_D_Mat : public UCL_BaseMat {
inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_rows=rows;
_cols=cols;
int err=_device_alloc(*this,device,rows,cols,_pitch,kind);
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
exit(1);
#endif
return err;
}
_kind=kind;
_rows=rows;
_cols=cols;
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifdef _OCL_MAT
_offset=0;
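With -DUCL_NO_EXIT on the compile line (added to the Makefiles in this commit), alloc() now returns the error code to the caller instead of printing a message and calling exit(1). A minimal sketch, assuming a UCL_Device named dev and a column count n:
UCL_D_Vec<float> v;
int err=v.alloc(n,dev,UCL_READ_WRITE);
if (err!=UCL_SUCCESS) {
  // recover gracefully, e.g. shrink the request or fall back to the host
}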

View File

@ -13,7 +13,7 @@
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@ -60,19 +60,24 @@ class UCL_D_Vec : public UCL_BaseMat {
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
int err=_device_alloc(*this,cq,_row_bytes,kind);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on device.\n";
_row_bytes=0;
exit(1);
#endif
_row_bytes=0;
return err;
}
_kind=kind;
_cols=cols;
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifdef _OCL_MAT
_offset=0;
@ -90,19 +95,23 @@ class UCL_D_Vec : public UCL_BaseMat {
inline int alloc(const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
int err=_device_alloc(*this,device,_row_bytes,kind);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on device.\n";
_row_bytes=0;
exit(1);
#endif
_row_bytes=0;
return err;
}
_kind=kind;
_cols=cols;
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifdef _OCL_MAT
_offset=0;

View File

@ -13,7 +13,7 @@
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@ -39,7 +39,11 @@ class UCL_H_Mat : public UCL_BaseMat {
};
typedef numtyp data_type;
UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { }
UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) {
#ifdef _OCL_MAT
_carray=(cl_mem)(0);
#endif
}
~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
/// Construct with specified number of rows and columns
@ -59,18 +63,23 @@ class UCL_H_Mat : public UCL_BaseMat {
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_rows=rows;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,cq,_row_bytes*_rows,kind);
#ifndef UCL_NO_EXIT
int err=_host_alloc(*this,cq,_row_bytes*rows,kind);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
<< " bytes on host.\n";
_row_bytes=0;
exit(1);
#endif
_row_bytes=0;
return err;
}
#endif
_cols=cols;
_rows=rows;
_kind=kind;
_end=_array+rows*cols;
return err;
}
@ -85,19 +94,24 @@ class UCL_H_Mat : public UCL_BaseMat {
inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_rows=rows;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,device,_row_bytes*_rows,kind);
_end=_array+rows*cols;
#ifndef UCL_NO_EXIT
int err=_host_alloc(*this,device,_row_bytes*rows,kind);
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
<< " bytes on host.\n";
_row_bytes=0;
exit(1);
#endif
_row_bytes=0;
return err;
}
#endif
_cols=cols;
_rows=rows;
_kind=kind;
_end=_array+rows*cols;
return err;
}

View File

@ -13,7 +13,7 @@
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@ -39,7 +39,11 @@ class UCL_H_Vec : public UCL_BaseMat {
};
typedef numtyp data_type;
UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { }
UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) {
#ifdef _OCL_MAT
_carray=(cl_mem)(0);
#endif
}
~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
/// Construct with n columns
@ -59,18 +63,24 @@ class UCL_H_Vec : public UCL_BaseMat {
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,cq,_row_bytes,kind);
_end=_array+cols;
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on host.\n";
_row_bytes=0;
exit(1);
#endif
_row_bytes=0;
return err;
}
#endif
_cols=cols;
_kind=kind;
_end=_array+cols;
return err;
}
@ -84,18 +94,24 @@ class UCL_H_Vec : public UCL_BaseMat {
inline int alloc(const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,device,_row_bytes,kind);
_end=_array+cols;
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on host.\n";
_row_bytes=0;
exit(1);
#endif
_row_bytes=0;
return err;
}
#endif
_cols=cols;
_kind=kind;
_end=_array+cols;
return err;
}

View File

@ -13,7 +13,7 @@
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
@ -25,8 +25,18 @@
#ifndef UCL_NV_KERNEL_H
#define UCL_NV_KERNEL_H
#define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y)
#if (__CUDA_ARCH__ < 200)
#define mul24 __mul24
#define MEM_THREADS 16
#else
#define mul24(X,Y) (X)*(Y)
#define MEM_THREADS 32
#endif
#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y)
#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x);
#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y);
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
@ -35,8 +45,9 @@
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#define __local __shared__
#define mul24 __mul24
#define __global
#define __inline static __inline__ __device__
#define atom_add atomicAdd
#endif
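The arch guard above reflects that __mul24 is only a win on sm_1x parts; from sm_20 (Fermi) onward a full 32-bit multiply is at least as fast, so mul24(X,Y) simply expands to (X)*(Y). A minimal CUDA sketch using the macros from this header (the kernel name is illustrative):
__kernel void fill_gid(__global int *out) {
  int gid=GLOBAL_ID_X;  // threadIdx.x + mul24(blockIdx.x,blockDim.x)
  out[gid]=gid;
}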

View File

@ -28,11 +28,11 @@ static LJ96_GPU_Memory<PRECISION,ACC_PRECISION> LJ96MF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
LJ96MF.clear();
gpu_mode=LJ96MF.device->gpu_mode();
double gpu_split=LJ96MF.device->particle_split();
@ -53,13 +53,11 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJ96MF.device->world_barrier();
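// (annotation) Rank 0 of the world initializes first so any one-time
// kernel compilation happens once; after this barrier, one process per
// GPU repeats init() on the remaining ranks, serialized by the
// gpu_barrier() calls in the loop below.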
if (message)
@ -74,46 +72,46 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum,
nall, 300, maxspecial, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
LJ96MF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
LJ96MF.estimate_gpu_overhead();
return init_ok;
}
void lj96_gpu_clear() {
LJ96MF.clear();
}
int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** lj96_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
return LJ96MF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success);
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void lj96_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success) {
LJ96MF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
void lj96_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double lj96_gpu_bytes() {

View File

@ -18,8 +18,6 @@
#ifndef LJ96_GPU_KERNEL
#define LJ96_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@ -46,7 +44,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
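// (annotation) Two neighbor layouts are handled: when dev_packed aliases
// dev_nbor, entries for atom i are strided by nbor_pitch and the
// t_per_atom cooperating threads interleave on that stride; otherwise
// *nbor is an offset into the densely packed dev_packed array and the
// threads interleave with unit stride.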
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@ -157,8 +172,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
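// (annotation) Each atom was handled by t_per_atom cooperating threads,
// so their partial force/energy sums are combined by a strided
// shared-memory tree reduction: each pass, the lower half of the
// cooperating threads adds in the upper half, until offset 0 holds the
// per-atom total.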
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Tree-reduce the per-thread partial sums
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -176,49 +230,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const int nall, const int nbor_pitch,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (ii<4)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@ -258,8 +328,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Tree-reduce the per-thread partial sums
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;

View File

@ -42,7 +42,7 @@ int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool LJ96_GPU_MemoryT::init(const int ntypes,
int LJ96_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
@ -50,14 +50,18 @@ bool LJ96_GPU_MemoryT::init(const int ntypes,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj96_cut_gpu_kernel);
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj96_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
@ -84,7 +88,7 @@ bool LJ96_GPU_MemoryT::init(const int ntypes,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -122,9 +126,10 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -133,16 +138,18 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -29,13 +29,20 @@ class LJ96_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
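A caller-side sketch mapping the return codes documented above to messages; the helper name and strings are assumptions for illustration, not library API:
inline const char *init_error_str(int err) {
  switch (err) {
    case  0: return "success";
    case -1: return "fix gpu not found";
    case -3: return "out of memory on the device";
    case -4: return "GPU library not compiled for this GPU";
    case -5: return "double precision not supported on this card";
    default: return "unknown initialization error";
  }
}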
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/

View File

@ -28,12 +28,11 @@ static LJL_GPU_Memory<PRECISION,ACC_PRECISION> LJLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljl_gpu_init(const int ntypes, double **cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
LJLMF.clear();
gpu_mode=LJLMF.device->gpu_mode();
double gpu_split=LJLMF.device->particle_split();
@ -54,13 +53,11 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
LJLMF.device->world_barrier();
if (message)
@ -75,45 +72,45 @@ bool ljl_gpu_init(const int ntypes, double **cutsq,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen);
LJLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
LJLMF.estimate_gpu_overhead();
return init_ok;
}
void ljl_gpu_clear() {
LJLMF.clear();
}
int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int ** ljl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
return LJLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success);
int **ilist, int **jnum, const double cpu_time,
bool &success) {
return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success);
}
void ljl_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success) {
LJLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
void ljl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}

View File

@ -18,8 +18,6 @@
#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@ -46,7 +44,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@ -156,8 +171,47 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Tree-reduce the per-thread partial sums
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -175,49 +229,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const int nall, const int nbor_pitch,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (ii<4)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
factor_lj = sp_lj[sbmask(j)];
@ -256,8 +326,47 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
// Tree-reduce the per-thread partial sums
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<4; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;

View File

@ -42,22 +42,26 @@ int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool LJL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_cut_gpu_kernel);
int LJL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
@ -84,7 +88,7 @@ bool LJL_GPU_MemoryT::init(const int ntypes,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -122,9 +126,10 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -133,16 +138,18 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
}

View File

@ -29,13 +29,20 @@ class LJL_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for the GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/

View File

@ -28,13 +28,13 @@ static LJC_GPU_Memory<PRECISION,ACC_PRECISION> LJCMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
LJCMF.clear();
gpu_mode=LJCMF.device->gpu_mode();
double gpu_split=LJCMF.device->particle_split();
@ -55,15 +55,12 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
LJCMF.device->world_barrier();
if (message)
@ -78,48 +75,51 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e);
LJCMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
LJCMF.estimate_gpu_overhead();
return init_ok;
}
void ljc_gpu_clear() {
LJCMF.clear();
}
int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** ljc_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return LJCMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljc_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
LJCMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
void ljc_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,
vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}
double ljc_gpu_bytes() {

View File

@ -18,8 +18,6 @@
#ifndef LJC_GPU_KERNEL
#define LJC_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@ -46,7 +44,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
@ -82,6 +80,8 @@ __inline float fetch_q(const int& i, const float *q)
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
@ -92,13 +92,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , __global numtyp *cutsq,
const numtyp qqrd2e) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, __global numtyp *q_ ,
__global numtyp *cutsq, const numtyp qqrd2e,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
@ -109,29 +113,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -188,8 +204,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
// Tree-reduce the per-thread partial sums
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -209,54 +266,69 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , __global numtyp *_cutsq,
const numtyp qqrd2e) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const numtyp qqrd2e, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (ii<8)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
cutsq[ii]=_cutsq[ii];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -312,8 +384,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Store answers
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
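A note on the neighbor-list plumbing introduced above: the kernels now receive both dev_nbor and dev_packed and pick a stride at run time. When the two pointers are equal, the list is stored column-wise with row pitch nbor_pitch; otherwise the third header row of dev_nbor holds an offset into a densely packed list. In either case the t_per_atom threads sharing atom ii interleave over the neighbors. A sketch of the two layouts (indexing illustrative):

  // Layout A (dev_nbor == dev_packed): one column per atom, pitched rows
  //   i    = dev_nbor[ii]
  //   numj = dev_nbor[ii +     nbor_pitch]
  //   j_k  = dev_nbor[ii + (2+k)*nbor_pitch],  k = 0 .. numj-1
  // Layout B: header in dev_nbor, neighbors packed contiguously
  //   start = dev_nbor[ii + 2*nbor_pitch]      // offset into dev_packed
  //   j_k   = dev_packed[start + k]
  // With t_per_atom threads per atom, the thread at a given offset
  // visits k = offset, offset + t_per_atom, offset + 2*t_per_atom, ...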
View File
@ -43,24 +43,28 @@ int LJC_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool LJC_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljc_cut_gpu_kernel);
int LJC_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljc_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
@ -95,7 +99,7 @@ bool LJC_GPU_MemoryT::init(const int ntypes,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -134,9 +138,10 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -145,19 +150,20 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &cutsq.begin(),
&_qqrd2e);
&_qqrd2e, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&cutsq.begin(), &_qqrd2e);
&cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
}
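With multiple threads per atom, each block of BX threads now covers only BX/t_per_atom atoms, which is why the grid-size computation changed. For example (illustrative numbers):

  // BX = 128, _threads_per_atom = 4  ->  32 atoms per block
  // inum = 10000                     ->  GX = ceil(10000/32.0) = 313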
View File
@ -29,15 +29,22 @@ class LJC_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
double **host_cut_coulsq, double *host_special_coul,
const double qqrd2e);
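Since init() now reports an integer status instead of a bool, callers can distinguish the failure modes listed above. A hedged usage sketch (object name and messages are illustrative, not from the library):

  int err = ljcm.init(ntypes, host_cutsq, host_lj1, host_lj2, host_lj3,
                      host_lj4, host_offset, host_special_lj, nlocal, nall,
                      max_nbors, maxspecial, cell_size, gpu_split, screen,
                      host_cut_ljsq, host_cut_coulsq, host_special_coul,
                      qqrd2e);
  if (err == -3)
    fprintf(screen, "Insufficient device memory for pair style.\n");
  else if (err != 0)
    fprintf(screen, "GPU pair init failed with code %d.\n", err);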
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
View File
@ -28,14 +28,14 @@ static LJCL_GPU_Memory<PRECISION,ACC_PRECISION> LJCLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
LJCLMF.clear();
gpu_mode=LJCLMF.device->gpu_mode();
double gpu_split=LJCLMF.device->particle_split();
@ -56,15 +56,12 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e,g_ewald);
if (!init_ok)
return false;
}
int init_ok=0;
if (world_me==0)
init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
LJCLMF.device->world_barrier();
if (message)
@ -79,48 +76,51 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
if (!init_ok)
return false;
}
if (gpu_rank==i && world_me!=0)
init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial,
cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
LJCLMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
if (init_ok==0)
LJCLMF.estimate_gpu_overhead();
return init_ok;
}
void ljcl_gpu_clear() {
LJCLMF.clear();
}
int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
int** ljcl_gpu_compute_n(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
double *sublo, double *subhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return LJCLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}
void ljcl_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
LJCLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
void ljcl_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
host_q,nlocal,boxlo,prd);
}
double ljcl_gpu_bytes() {
View File
@ -18,8 +18,6 @@
#ifndef LJCL_GPU_KERNEL
#define LJCL_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
@ -54,7 +52,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> pos_tex;
texture<float> q_tex;
@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q)
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8
#endif
@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; }
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__global int *dev_packed, __global acctyp4 *ans,
__global acctyp *engv, const int eflag,
const int vflag, const int inum, const int nall,
const int nbor_pitch, __global numtyp *q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp g_ewald, const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
@ -117,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
if (ii<inum) {
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -204,8 +220,49 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
}
} // for nbor
} // if ii
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
// Store answers
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
@ -225,52 +282,68 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global numtyp* sp_lj_in,
__global int *dev_nbor, __global int *dev_packed,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
const numtyp qqrd2e, const numtyp g_ewald,
const int t_per_atom) {
int tid=THREAD_ID_X;
int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
ii+=tid/t_per_atom;
int offset=tid%t_per_atom;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (ii<8)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[tid]=lj1_in[tid];
if (eflag>0)
lj3[ii]=lj3_in[ii];
lj3[tid]=lj3_in[tid];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__syncthreads();
if (ii<inum) {
acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0;
f.y=(acctyp)0;
f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
int n_stride;
__global int *list_end;
if (dev_nbor==dev_packed) {
list_end=nbor+mul24(numj,nbor_pitch);
nbor+=mul24(offset,nbor_pitch);
n_stride=mul24(t_per_atom,nbor_pitch);
} else {
nbor=dev_packed+*nbor;
list_end=nbor+numj;
n_stride=t_per_atom;
nbor+=offset;
}
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=nbor_pitch) {
for ( ; nbor<list_end; nbor+=n_stride) {
int j=*nbor;
numtyp factor_lj, factor_coul;
@ -334,8 +407,49 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
}
} // for nbor
} // if ii
// Store answers
// Reduce answers
if (t_per_atom>1) {
__local acctyp red_acc[6][BLOCK_PAIR];
red_acc[0][tid]=f.x;
red_acc[1][tid]=f.y;
red_acc[2][tid]=f.z;
red_acc[3][tid]=energy;
red_acc[4][tid]=e_coul;
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<5; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
f.x=red_acc[0][tid];
f.y=red_acc[1][tid];
f.z=red_acc[2][tid];
energy=red_acc[3][tid];
e_coul=red_acc[4][tid];
if (vflag>0) {
for (int r=0; r<6; r++)
red_acc[r][tid]=virial[r];
for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
if (offset < s) {
for (int r=0; r<6; r++)
red_acc[r][tid] += red_acc[r][tid+s];
}
}
for (int r=0; r<6; r++)
virial[r]=red_acc[r][tid];
}
}
// Store answers
if (ii<inum && offset==0) {
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
View File
@ -43,7 +43,7 @@ int LJCL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
}
template <class numtyp, class acctyp>
bool LJCL_GPU_MemoryT::init(const int ntypes,
int LJCL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
@ -54,14 +54,18 @@ bool LJCL_GPU_MemoryT::init(const int ntypes,
double **host_cut_ljsq, const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljcl_cut_gpu_kernel);
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,ljcl_cut_gpu_kernel);
if (success!=0)
return success;
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
int max_shared_types=this->device->max_shared_types();
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
lj_types=max_shared_types;
shared_types=true;
}
_lj_types=lj_types;
@ -94,7 +98,7 @@ bool LJCL_GPU_MemoryT::init(const int ntypes,
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
return 0;
}
template <class numtyp, class acctyp>
@ -132,9 +136,10 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));
int ainum=this->atom->inum();
int ainum=this->ans->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
@ -143,19 +148,21 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&this->_nbor_data->begin(),
&this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald);
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald);
&_cut_coulsq, &_qqrd2e, &_g_ewald,
&this->_threads_per_atom);
}
this->time_pair.stop();
}
View File
@ -29,15 +29,22 @@ class LJCL_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
View File
@ -29,9 +29,8 @@ __win_sort _win_sort;
#endif
template <class numtyp, class acctyp>
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false),
_vflag(false),_inum(0),_ilist(NULL),
_newton(false) {
PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),
_max_gpu_bytes(0) {
#ifndef USE_OPENCL
sort_config.op = CUDPP_ADD;
sort_config.datatype = CUDPP_UINT;
@ -56,28 +55,20 @@ int PairGPUAtomT::bytes_per_atom() const {
int id_space=0;
if (_gpu_nbor)
id_space=2;
int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space;
int bytes=4*sizeof(numtyp)+id_space;
if (_rot)
bytes+=4*sizeof(numtyp)+4*sizeof(acctyp);
bytes+=4*sizeof(numtyp);
if (_charge)
bytes+=sizeof(numtyp);
return bytes;
}
template <class numtyp, class acctyp>
bool PairGPUAtomT::alloc(const int inum, const int nall) {
bool PairGPUAtomT::alloc(const int nall) {
_max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
if (_newton)
_max_local=_max_atoms;
else
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
bool success=true;
int ans_elements=4;
if (_rot)
ans_elements+=4;
// Ignore host/device transfers?
bool cpuview=false;
if (dev->device_type()==UCL_CPU)
@ -107,8 +98,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
success=success && (host_x.alloc(_max_atoms*4,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
#endif
success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
// Buffer for casting only if different precisions
if (_charge)
success=success && (host_q.alloc(_max_atoms,*dev,
@ -120,15 +109,13 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
// --------------------------- Device allocations
_gpu_bytes=0;
int gpu_bytes=0;
if (cpuview) {
#ifdef GPU_CAST
assert(0==1);
#else
dev_x.view(host_x);
#endif
dev_engv.view(host_engv);
dev_ans.view(host_ans);
if (_rot)
dev_quat.view(host_quat);
if (_charge)
@ -140,49 +127,80 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) {
dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY));
success=success && (UCL_SUCCESS==
dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY));
_gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes();
#else
success=success && (UCL_SUCCESS==
dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY));
#endif
success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (dev_ans.alloc(ans_elements*_max_local,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
if (_charge) {
success=success && (dev_q.alloc(_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_q.row_bytes();
gpu_bytes+=dev_q.row_bytes();
}
if (_rot) {
success=success && (dev_quat.alloc(_max_atoms*4,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
_gpu_bytes+=dev_quat.row_bytes();
gpu_bytes+=dev_quat.row_bytes();
}
}
if (_gpu_nbor) {
success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS);
_gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes();
if (_bonds) {
success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS);
_gpu_bytes+=dev_tag.row_bytes();
gpu_bytes+=dev_tag.row_bytes();
}
}
_gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes();
gpu_bytes+=dev_x.row_bytes();
if (gpu_bytes>_max_gpu_bytes)
_max_gpu_bytes=gpu_bytes;
_allocated=true;
return success;
}
template <class numtyp, class acctyp>
bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
const bool rot, UCL_Device &devi, const bool gpu_nbor,
bool PairGPUAtomT::add_fields(const bool charge, const bool rot,
const bool gpu_nbor, const bool bonds) {
bool realloc=false;
if (charge && _charge==false) {
_charge=true;
realloc=true;
}
if (rot && _rot==false) {
_rot=true;
realloc=true;
}
if (gpu_nbor && _gpu_nbor==false) {
_gpu_nbor=true;
realloc=true;
}
if (bonds && _bonds==false) {
_bonds=true;
realloc=true;
}
if (realloc) {
_other=_charge || _rot;
int max_atoms=_max_atoms;
clear_resize();
return alloc(max_atoms);
}
return true;
}
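add_fields() exists so that a second GPU style can reuse a PairGPUAtom another style already initialized: each flag is only ever switched on, never off, and a single reallocation happens if any newly requested field (charges, quaternions, device neighboring, bonds) was missing. A sketch of the intended sequence (assumed usage, not library code):

  // First style: positions only
  atom.init(nall, /*charge=*/false, /*rot=*/false, dev, gpu_nbor, bonds);
  // Second style needs charges; storage is resized once, other fields kept
  atom.add_fields(/*charge=*/true, /*rot=*/false, gpu_nbor, bonds);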
template <class numtyp, class acctyp>
bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot,
UCL_Device &devi, const bool gpu_nbor,
const bool bonds) {
clear();
bool success=true;
_x_avail=false;
_q_avail=false;
_quat_avail=false;
_resized=false;
_gpu_nbor=gpu_nbor;
_bonds=bonds;
_charge=charge;
@ -190,33 +208,25 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge,
_other=_charge || _rot;
dev=&devi;
_e_fields=1;
if (_charge)
_e_fields++;
_ev_fields=6+_e_fields;
// Initialize atom and nbor data
int ef_inum=inum;
if (ef_inum==0)
ef_inum=1000;
int ef_nall=nall;
if (ef_nall<=ef_inum)
ef_nall=ef_inum*2;
if (ef_nall==0)
ef_nall=2000;
// Initialize timers for the selected device
time_pos.init(*dev);
time_other.init(*dev);
time_answer.init(*dev);
time_q.init(*dev);
time_quat.init(*dev);
time_pos.zero();
time_other.zero();
time_answer.zero();
time_q.zero();
time_quat.zero();
_time_cast=0.0;
#ifdef GPU_CAST
compile_kernels(*dev);
#endif
return success && alloc(ef_inum,ef_nall);
return success && alloc(ef_nall);
}
template <class numtyp, class acctyp>
@ -234,16 +244,12 @@ void PairGPUAtomT::clear_resize() {
dev_quat.clear();
host_quat.clear();
}
dev_ans.clear();
dev_engv.clear();
#ifndef GPU_CAST
host_x.clear();
#else
host_x_cast.clear();
host_type_cast.clear();
#endif
host_ans.clear();
host_engv.clear();
dev_cell_id.clear();
dev_particle_id.clear();
dev_tag.clear();
@ -261,17 +267,14 @@ void PairGPUAtomT::clear_resize() {
template <class numtyp, class acctyp>
void PairGPUAtomT::clear() {
_gpu_bytes=0;
_max_gpu_bytes=0;
if (!_allocated)
return;
time_pos.clear();
time_other.clear();
time_answer.clear();
time_q.clear();
time_quat.clear();
clear_resize();
_inum=0;
_eflag=false;
_vflag=false;
#ifdef GPU_CAST
if (_compiled) {
@ -289,255 +292,10 @@ double PairGPUAtomT::host_memory_usage() const {
atom_bytes+=1;
if (_rot)
atom_bytes+=4;
int ans_bytes=atom_bytes+_ev_fields;
return _max_atoms*atom_bytes*sizeof(numtyp)+
ans_bytes*(_max_local)*sizeof(acctyp)+
sizeof(PairGPUAtom<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom) {
time_answer.start();
_eflag=eflag;
_vflag=vflag;
_ef_atom=ef_atom;
_vf_atom=vf_atom;
int csize=_ev_fields;
if (!eflag)
csize-=_e_fields;
if (!vflag)
csize-=6;
if (csize>0)
ucl_copy(host_engv,dev_engv,_inum*csize,true);
if (_rot)
ucl_copy(host_ans,dev_ans,_inum*4*2,true);
else
ucl_copy(host_ans,dev_ans,_inum*4,true);
time_answer.stop();
}
template <class numtyp, class acctyp>
void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom,
int *ilist) {
_ilist=ilist;
copy_answers(eflag,vflag,ef_atom,vf_atom);
}
template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
double *virial) {
if (_eflag==false && _vflag==false)
return 0.0;
double evdwl=0.0;
if (_gpu_nbor) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
}
evdwl*=0.5;
return evdwl;
}
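The 0.5 factors at the end of energy_virial() reflect that the device kernels accumulate over full neighbor lists, so every i-j pair is counted once from i's list and once from j's. As an illustrative check with a single pair: the kernels add e_pair into both e_i and e_j, so sum_i e_i = 2*e_pair and evdwl = 0.5 * sum_i e_i = e_pair, as expected; the same halving applies to each virial component.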
template <class numtyp, class acctyp>
double PairGPUAtomT::energy_virial(double *eatom, double **vatom,
double *virial, double &ecoul) {
if (_eflag==false && _vflag==false) {
ecoul=0.0;
return 0.0;
}
if (_charge==false)
return energy_virial(eatom,vatom,virial);
double evdwl=0.0;
double _ecoul=0.0;
if (_gpu_nbor) {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[i]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[i][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
} else {
for (int i=0; i<_inum; i++) {
acctyp *ap=host_engv.begin()+i;
int ii=_ilist[i];
if (_eflag) {
if (_ef_atom) {
evdwl+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
_ecoul+=*ap;
eatom[ii]+=*ap*0.5;
ap+=_inum;
} else {
evdwl+=*ap;
ap+=_inum;
_ecoul+=*ap;
ap+=_inum;
}
}
if (_vflag) {
if (_vf_atom) {
for (int j=0; j<6; j++) {
vatom[ii][j]+=*ap*0.5;
virial[j]+=*ap;
ap+=_inum;
}
} else {
for (int j=0; j<6; j++) {
virial[j]+=*ap;
ap+=_inum;
}
}
}
}
for (int j=0; j<6; j++)
virial[j]*=0.5;
}
evdwl*=0.5;
ecoul+=_ecoul*0.5;
return evdwl;
}
template <class numtyp, class acctyp>
void PairGPUAtomT::get_answers(double **f, double **tor) {
acctyp *ap=host_ans.begin();
if (_gpu_nbor) {
for (int i=0; i<_inum; i++) {
f[i][0]+=*ap;
ap++;
f[i][1]+=*ap;
ap++;
f[i][2]+=*ap;
ap+=2;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
tor[i][0]+=*ap;
ap++;
tor[i][1]+=*ap;
ap++;
tor[i][2]+=*ap;
ap+=2;
}
}
} else {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
f[ii][0]+=*ap;
ap++;
f[ii][1]+=*ap;
ap++;
f[ii][2]+=*ap;
ap+=2;
}
if (_rot) {
for (int i=0; i<_inum; i++) {
int ii=_ilist[i];
tor[ii][0]+=*ap;
ap++;
tor[ii][1]+=*ap;
ap++;
tor[ii][2]+=*ap;
ap+=2;
}
}
}
}
// Sort arrays for neighbor list calculation
template <class numtyp, class acctyp>
void PairGPUAtomT::sort_neighbor(const int num_atoms) {
View File
@ -23,7 +23,6 @@
#ifdef USE_OPENCL
#include "geryon/ocl_device.h"
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
@ -32,7 +31,6 @@ using namespace ucl_opencl;
#else
#include "cudpp.h"
#include "geryon/nvd_device.h"
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
@ -40,10 +38,6 @@ using namespace ucl_cudadr;
#endif
#ifndef int2
struct int2 { int x; int y; };
#endif
#include "pair_gpu_precision.h"
template <class numtyp, class acctyp>
@ -56,13 +50,9 @@ class PairGPUAtom {
inline int max_atoms() const { return _max_atoms; }
/// Current number of local+ghost atoms stored
inline int nall() const { return _nall; }
/// Current number of local atoms stored
inline int inum() const { return _inum; }
/// Set number of local+ghost atoms for future copy operations
inline void nall(const int n) { _nall=n; }
/// Set number of local atoms for future copy operations
inline void inum(const int n) { _inum=n; }
/// Memory usage per atom in this class
int bytes_per_atom() const;
@ -70,21 +60,33 @@ class PairGPUAtom {
/// Clear any previous data and set up for a new LAMMPS run
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool init(const int inum, const int nall, const bool charge, const bool rot,
bool init(const int nall, const bool charge, const bool rot,
UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false);
/// Check if we have enough device storage and realloc if not
inline bool resize(const int inum, const int nall, bool &success) {
_inum=inum;
/** Returns true if resized with any call during this timestep **/
inline bool resize(const int nall, bool &success) {
_nall=nall;
if (inum>_max_local || nall>_max_atoms) {
if (nall>_max_atoms) {
clear_resize();
success = success && alloc(inum,nall);
return true;
success = success && alloc(nall);
_resized=true;
}
return false;
return _resized;
}
/// If already initialized by another LAMMPS style, add fields as necessary
/** \param rot True if atom storage needs quaternions
* \param gpu_nbor True if neighboring will be performed on device **/
bool add_fields(const bool charge, const bool rot, const bool gpu_nbor,
const bool bonds);
/// Returns true if GPU is using charges
bool charge() { return _charge; }
/// Returns true if GPU is using quaternions
bool quat() { return _rot; }
/// Only free matrices of length inum or nall for resizing
void clear_resize();
@ -100,28 +102,42 @@ class PairGPUAtom {
/// Add copy times to timers
inline void acc_timers() {
time_pos.add_to_total();
time_answer.add_to_total();
if (_other)
time_other.add_to_total();
if (_charge)
time_q.add_to_total();
if (_rot)
time_quat.add_to_total();
}
/// Add copy times to timers
inline void zero_timers() {
time_pos.zero();
time_answer.zero();
if (_other)
time_other.zero();
if (_charge)
time_q.zero();
if (_rot)
time_quat.zero();
}
/// Return the total time for host/device data transfer
/** Zeros the total so that the atom times are only included once **/
inline double transfer_time() {
double total=time_pos.total_seconds()+time_answer.total_seconds();
if (_other) total+=time_other.total_seconds();
double total=time_pos.total_seconds();
time_pos.zero_total();
if (_charge) {
total+=time_q.total_seconds();
time_q.zero_total();
}
if (_rot) {
total+=time_quat.total_seconds();
time_quat.zero_total();
}
return total;
}
/// Return the total time for data cast/pack
inline double cast_time() { return _time_cast; }
/** Zeros the time so that atom times are only included once **/
inline double cast_time()
{ double t=_time_cast; _time_cast=0.0; return t; }
/// Pack LAMMPS atom type constants into matrix and copy to device
template <class dev_typ, class t1>
@ -216,43 +232,52 @@ class PairGPUAtom {
// -------------------------COPY TO GPU ----------------------------------
/// Signal that we need to transfer atom data for next timestep
inline void data_unavail()
{ _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; }
/// Cast positions and types to write buffer
inline void cast_x_data(double **host_ptr, const int *host_type) {
double t=MPI_Wtime();
#ifdef GPU_CAST
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else
numtyp *_write_loc=host_x.begin();
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
if (_x_avail==false) {
double t=MPI_Wtime();
#ifdef GPU_CAST
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
#else
numtyp *_write_loc=host_x.begin();
for (int i=0; i<_nall; i++) {
*_write_loc=host_ptr[i][0];
_write_loc++;
*_write_loc=host_ptr[i][1];
_write_loc++;
*_write_loc=host_ptr[i][2];
_write_loc++;
*_write_loc=host_type[i];
_write_loc++;
}
#endif
_time_cast+=MPI_Wtime()-t;
}
#endif
_time_cast+=MPI_Wtime()-t;
}
}
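cast_x_data() packs each atom's position and type into one numtyp4, so the kernels get x, y, z and the type in a single fetch; the new _x_avail guard skips the repack when the device copy is still current within a timestep. The write-buffer layout (illustrative):

  // host_x after the cast, one numtyp4 per atom:
  //   [ x0 y0 z0 type0 | x1 y1 z1 type1 | ... ]
  // fetch_pos(i,x_) then returns position and type together, and
  // int itype = ix.w recovers the type in the kernels.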
/// Copy positions and types to device asynchronously
/** Copies nall() elements **/
inline void add_x_data(double **host_ptr, int *host_type) {
time_pos.start();
#ifdef GPU_CAST
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
&_nall);
#else
ucl_copy(dev_x,host_x,_nall*4,true);
#endif
if (_x_avail==false) {
#ifdef GPU_CAST
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true);
ucl_copy(dev_type_cast,host_type_cast,_nall,true);
int block_size=64;
int GX=static_cast<int>(ceil(static_cast<double>(_nall)/block_size));
k_cast_x.set_size(GX,block_size);
k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(),
&_nall);
#else
ucl_copy(dev_x,host_x,_nall*4,true);
#endif
_x_avail=true;
}
time_pos.stop();
}
@ -262,87 +287,68 @@ class PairGPUAtom {
add_x_data(host_ptr,host_type);
}
/// Cast charges to write buffer
// Cast charges to write buffer
template<class cpytyp>
inline void cast_q_data(cpytyp *host_ptr) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_q.view((numtyp*)host_ptr,_nall,*dev);
dev_q.view(host_q);
} else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
if (_q_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_q.view((numtyp*)host_ptr,_nall,*dev);
dev_q.view(host_q);
} else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp));
else
for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
_time_cast+=MPI_Wtime()-t;
}
/// Copy charges to device asynchronously
// Copy charges to device asynchronously
inline void add_q_data() {
ucl_copy(dev_q,host_q,_nall,true);
if (_q_avail==false) {
ucl_copy(dev_q,host_q,_nall,true);
_q_avail=true;
}
}
/// Cast quaternions to write buffer
// Cast quaternions to write buffer
template<class cpytyp>
inline void cast_quat_data(cpytyp *host_ptr) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
dev_quat.view(host_quat);
} else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
if (_quat_avail==false) {
double t=MPI_Wtime();
if (dev->device_type()==UCL_CPU) {
if (sizeof(numtyp)==sizeof(double)) {
host_quat.view((numtyp*)host_ptr,_nall*4,*dev);
dev_quat.view(host_quat);
} else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
} else {
if (sizeof(numtyp)==sizeof(double))
memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp));
else
for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i];
}
_time_cast+=MPI_Wtime()-t;
}
_time_cast+=MPI_Wtime()-t;
}
/// Copy quaternions to device
// Copy quaternions to device
/** Copies nall()*4 elements **/
inline void add_quat_data() {
ucl_copy(dev_quat,host_quat,_nall*4,true);
if (_quat_avail==false) {
ucl_copy(dev_quat,host_quat,_nall*4,true);
_quat_avail=true;
}
}
/// Copy data other than pos and data to device
inline void add_other_data() {
time_other.start();
if (_charge)
add_q_data();
if (_rot)
add_quat_data();
time_other.stop();
}
/// Return number of bytes used on device
inline double gpu_bytes() { return _gpu_bytes; }
// -------------------------COPY FROM GPU -------------------------------
/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom);
/// Copy answers from device into read buffer asynchronously
void copy_answers(const bool eflag, const bool vflag,
const bool ef_atom, const bool vf_atom, int *ilist);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial);
/// Copy energy and virial data into LAMMPS memory
double energy_virial(double *eatom, double **vatom, double *virial,
double &ecoul);
/// Add forces and torques from the GPU into a LAMMPS pointer
void get_answers(double **f, double **tor);
inline double max_gpu_bytes()
{ double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; }
// ------------------------------ DATA ----------------------------------
@ -352,10 +358,6 @@ class PairGPUAtom {
UCL_D_Vec<numtyp> dev_q;
/// Quaterions
UCL_D_Vec<numtyp> dev_quat;
/// Force and possibly torque
UCL_D_Vec<acctyp> dev_ans;
/// Energy and virial per-atom storage
UCL_D_Vec<acctyp> dev_engv;
#ifdef GPU_CAST
UCL_D_Vec<double> dev_x_cast;
@ -370,10 +372,6 @@ class PairGPUAtom {
UCL_H_Vec<numtyp> host_q;
/// Buffer for moving quat data to GPU
UCL_H_Vec<numtyp> host_quat;
/// Force and possibly torque data on host
UCL_H_Vec<acctyp> host_ans;
/// Energy/virial data on host
UCL_H_Vec<acctyp> host_engv;
/// Cell list identifiers for device nbor builds
UCL_D_Vec<unsigned> dev_cell_id;
@ -383,7 +381,7 @@ class PairGPUAtom {
UCL_D_Vec<int> dev_tag;
/// Device timers
UCL_Timer time_pos, time_other, time_answer;
UCL_Timer time_pos, time_q, time_quat;
/// Geryon device
UCL_Device *dev;
@ -396,19 +394,19 @@ class PairGPUAtom {
#endif
bool _compiled;
bool alloc(const int inum, const int nall);
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields;
// True if data has been copied to device already
bool _x_avail, _q_avail, _quat_avail, _resized;
bool alloc(const int nall);
bool _allocated, _rot, _charge, _other;
int _max_atoms, _nall;
bool _gpu_nbor, _bonds;
int *_ilist;
double _time_cast;
double _gpu_bytes;
double _max_gpu_bytes;
bool _newton;
#ifndef USE_OPENCL
CUDPPConfiguration sort_config;
CUDPPHandle sort_plan;
View File
@ -23,7 +23,7 @@
#define _HD_BALANCE_EVERY 25
#define _HD_BALANCE_WEIGHT 0.5
#define _HD_BALANCE_GAP 1.05
#define _HD_BALANCE_GAP 1.10
/// Host/device load balancer
template<class numtyp, class acctyp>
@ -33,7 +33,8 @@ class PairGPUBalance {
inline ~PairGPUBalance() { clear(); }
/// Clear any old data and setup for new LAMMPS run
inline void init(PairGPUDevice<numtyp, acctyp> *gpu, const double split);
inline void init(PairGPUDevice<numtyp, acctyp> *gpu, const bool gpu_nbor,
const double split);
/// Clear all host and device data
inline void clear() {
@ -43,23 +44,25 @@ class PairGPUBalance {
_init_done=false;
}
}
/// Return the timestep since initialization
inline int timestep() { return _timestep; }
/// Get a count of the number of particles host will handle for initial alloc
inline int first_host_count(const int nlocal,const bool gpu_nbor,
const double gpu_split) const {
inline int first_host_count(const int nlocal, const double gpu_split,
const bool gpu_nbor) const {
int host_nlocal=0;
if (gpu_nbor && gpu_split!=1.0) {
if (gpu_split>0)
host_nlocal=static_cast<int>(ceil((1.0-gpu_split)*nlocal));
else
host_nlocal=static_cast<int>(ceil(0.1*nlocal));
host_nlocal=static_cast<int>(ceil(0.05*nlocal));
}
return host_nlocal;
}
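first_host_count() sizes the initial host-side allocation when the device builds the neighbor lists but a fraction of local atoms stays on the host; note the fallback for dynamic balancing dropped from 10% to 5% of nlocal. Worked examples (illustrative numbers):

  // nlocal = 32000, gpu_split = 0.75 -> host_nlocal = ceil(0.25*32000) = 8000
  // nlocal = 32000, gpu_split < 0    -> host_nlocal = ceil(0.05*32000) = 1600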
/// Return the number of particles the device will handle this timestep
inline int get_gpu_count(const int timestep, const int ago,
const int inum_full);
inline int get_gpu_count(const int ago, const int inum_full);
/// Return the average fraction of particles handled by device on all procs
inline double all_avg_split() {
@ -82,10 +85,10 @@ class PairGPUBalance {
if (_measure_this_step) {
_device->gpu->sync();
_device->gpu_barrier();
_device->start_host_timer();
_device_time.start();
_device->gpu->sync();
_device->gpu_barrier();
_device->start_host_timer();
}
}
@ -95,34 +98,34 @@ class PairGPUBalance {
/// Calculate the new host/device split based on the cpu and device times
/** \note Only does calculation every _HD_BALANCE_EVERY timesteps
(and first 10) **/
inline void balance(const double cpu_time, const bool gpu_nbor);
inline void balance(const double cpu_time);
/// Calls balance() and then get_gpu_count()
inline int balance(const int timestep, const int ago, const int inum_full,
const double cpu_time, const bool gpu_nbor) {
balance(cpu_time,gpu_nbor);
return get_gpu_count(timestep,ago,inum_full);
inline int balance(const int ago,const int inum_full,const double cpu_time) {
balance(cpu_time);
return get_gpu_count(ago,inum_full);
}
private:
PairGPUDevice<numtyp,acctyp> *_device;
UCL_Timer _device_time;
bool _init_done;
bool _init_done, _gpu_nbor;
bool _load_balance;
double _actual_split, _avg_split, _desired_split, _max_split;
int _avg_count;
bool _measure_this_step;
int _inum, _inum_full;
int _inum, _inum_full, _timestep;
};
#define PairGPUBalanceT PairGPUBalance<numtyp,acctyp>
template <class numtyp, class acctyp>
void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
const double split) {
void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
const bool gpu_nbor, const double split) {
clear();
_gpu_nbor=gpu_nbor;
_init_done=true;
_device=gpu;
@ -130,7 +133,7 @@ void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
if (split<0.0) {
_load_balance=true;
_desired_split=0.9;
_desired_split=0.90;
} else {
_load_balance=false;
_desired_split=split;
@ -138,14 +141,14 @@ void PairGPUBalanceT::init(PairGPUDevice<numtyp, acctyp> *gpu,
_actual_split=_desired_split;
_avg_split=0.0;
_avg_count=0;
_timestep=0;
}
template <class numtyp, class acctyp>
int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago,
const int inum_full) {
int PairGPUBalanceT::get_gpu_count(const int ago, const int inum_full) {
_measure_this_step=false;
if (_load_balance) {
if (_avg_count<11 || timestep%_HD_BALANCE_EVERY==0) {
if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) {
_measure_this_step=true;
_inum_full=inum_full;
}
@ -156,44 +159,44 @@ int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago,
}
_inum=static_cast<int>(floor(_actual_split*inum_full));
if (_inum==0) _inum++;
_timestep++;
return _inum;
}
template <class numtyp, class acctyp>
void PairGPUBalanceT::balance(const double cpu_time, const bool gpu_nbor) {
void PairGPUBalanceT::balance(const double cpu_time) {
if (_measure_this_step) {
_measure_this_step=false;
double gpu_time=_device_time.seconds();
double max_gpu_time;
MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX,
_device->gpu_comm());
if (_inum_full==_inum) {
_desired_split=1.0;
return;
}
_measure_this_step=false;
double gpu_time=_device_time.seconds();
double cpu_time_per_atom=cpu_time/(_inum_full-_inum);
double cpu_other_time=_device->host_time()-cpu_time;
int host_inum=static_cast<int>((max_gpu_time-cpu_other_time)/
cpu_time_per_atom);
double cpu_gpu_time[3], max_times[3];
cpu_gpu_time[0]=cpu_time/(_inum_full-_inum);
cpu_gpu_time[1]=gpu_time/_inum;
cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full;
double split=static_cast<double>(_inum_full-host_inum)/_inum_full;
_desired_split=split*_HD_BALANCE_GAP;
if (_desired_split>1.0)
_desired_split=1.0;
if (_desired_split<0.0)
_desired_split=0.0;
MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX,
_device->gpu_comm());
double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]);
split*=_HD_BALANCE_GAP;
if (split>1.0)
split=1.0;
if (_avg_count<10)
_desired_split=(_desired_split*_avg_count+split)/(_avg_count+1);
else
_desired_split=_desired_split*(1.0-_HD_BALANCE_WEIGHT)+
_HD_BALANCE_WEIGHT*split;
if (!gpu_nbor) {
if (!_gpu_nbor) {
if (_desired_split<_max_split)
_actual_split=_desired_split;
else
_actual_split=_max_split;
}
//std::cout << gpu_time << " " << max_gpu_time << " " << cpu_other_time << " " << cpu_time_per_atom << " " << cpu_time << " " << _desired_split << " " << host_inum << std::endl;
}
_avg_split+=_desired_split;
_avg_count++;
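The rebalance now reduces per-atom times across the device communicator instead of raw GPU seconds: with c the max CPU pair time per host atom, g the max device time per device atom, and o the max remaining host time per owned atom, the new target is split = (c + o)/(c + g), widened by _HD_BALANCE_GAP and clamped at 1. A worked example (illustrative numbers):

  // c = 2.0e-6 s/atom, g = 0.5e-6 s/atom, o = 0.1e-6 s/atom
  // split = (2.0 + 0.1)/(2.0 + 0.5) = 0.84;  * 1.10 gap -> 0.924
  // -> about 92% of local atoms go to the device after this rebalance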
View File
@ -18,7 +18,7 @@
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#include "nv_kernel_def.h"
texture<float4> neigh_tex;
#ifdef _DOUBLE_DOUBLE
@ -36,6 +36,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#else
#define fetch_pos(i,y) x_[i]
#define BLOCK_NBOR_BUILD 64
#endif
@ -54,29 +55,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos)
#define numtyp4 float4
#endif
#define CELL_BLOCK_SIZE 64
#define BLOCK_2D 8
#define BLOCK_CELL_2D 8
#define SBBITS 30
#define SBBITS 30
__kernel void transpose(int *out, int *in, int columns_in, int rows_in)
{
__local float block[BLOCK_2D][BLOCK_2D+1];
__local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
unsigned ti=THREAD_ID_X;
unsigned tj=THREAD_ID_Y;
unsigned bi=BLOCK_ID_X;
unsigned bj=BLOCK_ID_Y;
unsigned i=bi*BLOCK_2D+ti;
unsigned j=bj*BLOCK_2D+tj;
unsigned i=bi*BLOCK_CELL_2D+ti;
unsigned j=bj*BLOCK_CELL_2D+tj;
if ((i<columns_in) && (j<rows_in))
block[tj][ti]=in[j*columns_in+i];
__syncthreads();
i=bj*BLOCK_2D+ti;
j=bi*BLOCK_2D+tj;
i=bj*BLOCK_CELL_2D+ti;
j=bi*BLOCK_CELL_2D+tj;
if ((i<rows_in) && (j<columns_in))
out[j*rows_in+i] = block[ti][tj];
}
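The renamed tile is declared [BLOCK_CELL_2D][BLOCK_CELL_2D+1]; the +1 column of padding is the usual trick for a shared-memory transpose:

  // Without padding, the column read block[ti][tj] strides by
  // BLOCK_CELL_2D, a multiple of the bank count, so the threads of a
  // half-warp would all hit the same bank. The extra column shifts each
  // row into a different bank, making both the row write and the column
  // read conflict-free.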
@ -141,7 +143,8 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
int *cell_particle_id,
int *cell_counts,
int *nbor_list,
int *host_nbor_list,
int *host_nbor_list,
int *host_numj,
int neigh_bin_size,
numtyp cell_size,
int ncellx, int ncelly, int ncellz,
@ -154,8 +157,8 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
int icell = ix + iy*ncellx + iz*ncellx*ncelly;
__shared__ int cell_list_sh[CELL_BLOCK_SIZE];
__shared__ numtyp4 pos_sh[CELL_BLOCK_SIZE];
__shared__ int cell_list_sh[BLOCK_NBOR_BUILD];
__shared__ numtyp4 pos_sh[BLOCK_NBOR_BUILD];
int icell_begin = cell_counts[icell];
int icell_end = cell_counts[icell+1];
@ -185,9 +188,9 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
neigh_list=neigh_counts+stride;
nbor_list[pid_i]=pid_i;
} else {
stride=nt-inum;
neigh_counts=host_nbor_list+pid_i-inum;
neigh_list=neigh_counts+stride;
stride=1;
neigh_counts=host_numj+pid_i-inum;
neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
}
// loop through neighbors
@ -203,13 +206,13 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
int num_atom_cell = jcell_end - jcell_begin;
// load jcell to shared memory
int num_iter = (int)ceil((numtyp)num_atom_cell/CELL_BLOCK_SIZE);
int num_iter = (int)ceil((numtyp)num_atom_cell/BLOCK_NBOR_BUILD);
for (int k = 0; k < num_iter; k++) {
int end_idx = min(CELL_BLOCK_SIZE, num_atom_cell-k*CELL_BLOCK_SIZE);
int end_idx = min(BLOCK_NBOR_BUILD, num_atom_cell-k*BLOCK_NBOR_BUILD);
if (tid < end_idx) {
pid_j = cell_particle_id[tid+k*CELL_BLOCK_SIZE+jcell_begin];
pid_j = cell_particle_id[tid+k*BLOCK_NBOR_BUILD+jcell_begin];
cell_list_sh[tid] = pid_j;
atom_j = fetch_pos(pid_j,pos); //[pid_j];
pos_sh[tid].x = atom_j.x;
@ -222,20 +225,18 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
for (int j = 0; j < end_idx; j++) {
int pid_j = cell_list_sh[j]; // gather from shared memory
if (pid_i<inum || pid_j<inum || pid_j>pid_i) {
diff.x = atom_i.x - pos_sh[j].x;
diff.y = atom_i.y - pos_sh[j].y;
diff.z = atom_i.z - pos_sh[j].z;
diff.x = atom_i.x - pos_sh[j].x;
diff.y = atom_i.y - pos_sh[j].y;
diff.z = atom_i.z - pos_sh[j].z;
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
if (r2 < cell_size*cell_size && r2 > 1e-5) {
if (cnt < neigh_bin_size) {
*neigh_list = pid_j;
neigh_list+=stride;
}
cnt++;
}
}
r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
if (r2 < cell_size*cell_size && r2 > 1e-5) {
if (cnt < neigh_bin_size) {
*neigh_list = pid_j;
neigh_list+=stride;
}
cnt++;
}
}
}
__syncthreads();
@ -249,9 +250,10 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos,
}
__kernel void kernel_special(__global int *dev_nbor,
__global int *host_nbor_list, __global int *tag,
__global int *host_nbor_list,
__global int *host_numj, __global int *tag,
__global int *nspecial, __global int *special,
int inum, int nt, int nall) {
int inum, int nt, int nall, int max_nbors) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
@ -263,15 +265,17 @@ __kernel void kernel_special(__global int *dev_nbor,
int n2=nspecial[ii*3+1];
int n3=nspecial[ii*3+2];
int numj;
if (ii < inum) {
stride=inum;
list=dev_nbor+stride+ii;
numj=*list;
list+=stride;
} else {
stride=nt-inum;
list=host_nbor_list+ii-inum;
stride=1;
list=host_nbor_list+(ii-inum)*max_nbors;
numj=host_numj[ii-inum];
}
int numj=*list;
list+=stride;
list_end=list+numj*stride;
for ( ; list<list_end; list+=stride) {
@ -294,4 +298,3 @@ __kernel void kernel_special(__global int *dev_nbor,
}
} // if ii
}
View File
@ -19,13 +19,22 @@
#include "pair_gpu_precision.h"
#include <map>
#include <math.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#ifdef USE_OPENCL
#include "pair_gpu_dev_cl.h"
#else
#include "pair_gpu_dev_ptx.h"
#endif
#define PairGPUDeviceT PairGPUDevice<numtyp, acctyp>
template <class numtyp, class acctyp>
PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false),
_gpu_mode(GPU_FORCE), _first_device(0),
_last_device(0) {
_last_device(0), _compiled(false) {
}
template <class numtyp, class acctyp>
@ -34,14 +43,19 @@ PairGPUDeviceT::~PairGPUDevice() {
}
template <class numtyp, class acctyp>
bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
const int first_gpu, const int last_gpu,
const int gpu_mode, const double p_split,
const int nthreads) {
int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
const int first_gpu, const int last_gpu,
const int gpu_mode, const double p_split,
const int nthreads, const int t_per_atom) {
_nthreads=nthreads;
#ifdef _OPENMP
omp_set_num_threads(nthreads);
#endif
_threads_per_atom=t_per_atom;
_threads_per_charge=t_per_atom;
if (_device_init)
return true;
return 0;
_device_init=true;
_comm_world=world;
_comm_replica=replica;
@ -96,7 +110,12 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
// set the device ID
_procs_per_gpu=static_cast<int>(ceil(static_cast<double>(procs_per_node)/
(last_gpu-first_gpu+1)));
int my_gpu=node_rank/_procs_per_gpu;
int my_gpu=node_rank/_procs_per_gpu+first_gpu;
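// Example (illustrative): 8 ranks on a node, first_gpu=0, last_gpu=1:
// procs_per_gpu = ceil(8/2) = 4, so ranks 0-3 share device 0 and
// ranks 4-7 share device 1; adding first_gpu handles the case where
// the first usable device is not device 0.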
// Time on the device only if 1 proc per gpu
_time_device=true;
if (_procs_per_gpu>1)
_time_device=false;
// Set up a per device communicator
MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu);
@ -104,39 +123,109 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica,
gpu=new UCL_Device();
if (my_gpu>=gpu->num_devices())
return false;
return -2;
gpu->set(my_gpu);
return true;
_long_range_precompute=0;
int flag=compile_kernels();
return flag;
}
template <class numtyp, class acctyp>
bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal,
const int host_nlocal, const int nall,
const int maxspecial, const bool gpu_nbor,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut) {
int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut) {
if (!_device_init)
return false;
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
return -5;
// Counts of data transfers for timing overhead estimates
_data_in_estimate=0;
_data_out_estimate=1;
// Initial number of local particles
int ef_nlocal=nlocal;
if (_particle_split<1.0 && _particle_split>0.0)
ef_nlocal=static_cast<int>(_particle_split*nlocal);
bool gpu_nbor=false;
if (_gpu_mode==GPU_NEIGH)
gpu_nbor=true;
if (_init_count==0) {
// Initialize atom and nbor data
int ef_nlocal=nlocal;
if (_particle_split<1.0 && _particle_split>0.0)
ef_nlocal=static_cast<int>(_particle_split*nlocal);
if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor,
gpu_nbor && maxspecial>0))
return false;
if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor,
gpu_host,pre_cut))
return false;
nbor.cell_size(cell_size);
if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0))
return -3;
_data_in_estimate++;
if (charge)
_data_in_estimate++;
if (rot)
_data_in_estimate++;
} else {
if (cell_size>nbor.cell_size())
nbor.cell_size(cell_size);
if (atom.charge()==false && charge)
_data_in_estimate++;
if (atom.quat()==false && rot)
_data_in_estimate++;
if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial))
return -3;
}
if (!ans.init(ef_nlocal,charge,rot,*gpu))
return -3;
if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
_block_cell_id, _block_nbor_build))
return -3;
nbor->cell_size(cell_size);
_init_count++;
return true;
return 0;
}
template <class numtyp, class acctyp>
int PairGPUDeviceT::init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal,
const int nall) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
return -5;
if (_init_count==0) {
// Initialize atom and nbor data
if (!atom.init(nall,true,false,*gpu,false,false))
return -3;
} else
if (!atom.add_fields(true,false,false,false))
return -3;
if (!ans.init(nlocal,true,false,*gpu))
return -3;
_init_count++;
return 0;
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::set_single_precompute
(PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm) {
_long_range_precompute=1;
pppm_single=pppm;
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::set_double_precompute
(PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm) {
_long_range_precompute=2;
pppm_double=pppm;
}
template <class numtyp, class acctyp>
@@ -152,11 +241,17 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"-------------------------------------\n");
fprintf(screen,"- Using GPGPU acceleration for %s:\n",name);
fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu);
fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu);
#ifdef _OPENMP
fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads);
#endif
fprintf(screen,"-------------------------------------");
fprintf(screen,"-------------------------------------\n");
for (int i=first_gpu; i<=last_gpu; i++) {
int last=last_gpu+1;
if (last>gpu->num_devices())
last=gpu->num_devices();
for (int i=first_gpu; i<last; i++) {
std::string sname=gpu->name(i)+", "+toa(gpu->cores(i))+" cores, "+fs+
toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+
" GHZ (";
@@ -177,32 +272,152 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name,
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
const double max_bytes, FILE *screen) {
double single[5], times[5];
void PairGPUDeviceT::estimate_gpu_overhead(const int kernel_calls,
double &gpu_overhead,
double &gpu_driver_overhead) {
UCL_H_Vec<int> *host_data_in=NULL, *host_data_out=NULL;
UCL_D_Vec<int> *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL;
UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL;
UCL_Timer over_timer(*gpu);
single[0]=atom.transfer_time();
if (_data_in_estimate>0) {
host_data_in=new UCL_H_Vec<int>[_data_in_estimate];
dev_data_in=new UCL_D_Vec<int>[_data_in_estimate];
timers_in=new UCL_Timer[_data_in_estimate];
}
if (_data_out_estimate>0) {
host_data_out=new UCL_H_Vec<int>[_data_out_estimate];
dev_data_out=new UCL_D_Vec<int>[_data_out_estimate];
timers_out=new UCL_Timer[_data_out_estimate];
}
if (kernel_calls>0) {
kernel_data=new UCL_D_Vec<int>[kernel_calls];
timers_kernel=new UCL_Timer[kernel_calls];
}
for (int i=0; i<_data_in_estimate; i++) {
host_data_in[i].alloc(1,*gpu);
dev_data_in[i].alloc(1,*gpu);
timers_in[i].init(*gpu);
}
for (int i=0; i<_data_out_estimate; i++) {
host_data_out[i].alloc(1,*gpu);
dev_data_out[i].alloc(1,*gpu);
timers_out[i].init(*gpu);
}
for (int i=0; i<kernel_calls; i++) {
kernel_data[i].alloc(1,*gpu);
timers_kernel[i].init(*gpu);
}
gpu_overhead=0.0;
gpu_driver_overhead=0.0;
for (int i=0; i<10; i++) {
gpu->sync();
gpu_barrier();
over_timer.start();
gpu->sync();
gpu_barrier();
double driver_time=MPI_Wtime();
for (int i=0; i<_data_in_estimate; i++) {
timers_in[i].start();
ucl_copy(dev_data_in[i],host_data_in[i],true);
timers_in[i].stop();
}
for (int i=0; i<kernel_calls; i++) {
timers_kernel[i].start();
zero(kernel_data[i],1);
timers_kernel[i].stop();
}
for (int i=0; i<_data_out_estimate; i++) {
timers_out[i].start();
ucl_copy(host_data_out[i],dev_data_out[i],true);
timers_out[i].stop();
}
over_timer.stop();
double time=over_timer.seconds();
driver_time=MPI_Wtime()-driver_time;
if (time_device()) {
for (int i=0; i<_data_in_estimate; i++)
timers_in[i].add_to_total();
for (int i=0; i<kernel_calls; i++)
timers_kernel[i].add_to_total();
for (int i=0; i<_data_out_estimate; i++)
timers_out[i].add_to_total();
}
double mpi_time, mpi_driver_time;
MPI_Allreduce(&time,&mpi_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
MPI_Allreduce(&driver_time,&mpi_driver_time,1,MPI_DOUBLE,MPI_MAX,gpu_comm());
gpu_overhead+=mpi_time;
gpu_driver_overhead+=mpi_driver_time;
}
gpu_overhead/=10.0;
gpu_driver_overhead/=10.0;
if (_data_in_estimate>0) {
delete [] host_data_in;
delete [] dev_data_in;
delete [] timers_in;
}
if (_data_out_estimate>0) {
delete [] host_data_out;
delete [] dev_data_out;
delete [] timers_out;
}
if (kernel_calls>0) {
delete [] kernel_data;
delete [] timers_kernel;
}
}
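// A minimal usage sketch (hypothetical caller, assuming a style with one
// kernel call per timestep); the averages are later passed to
// output_times() for reporting:
//
//   double gpu_overhead, driver_overhead;
//   pair_gpu_device.estimate_gpu_overhead(1,gpu_overhead,driver_overhead);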
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_times(UCL_Timer &time_pair,
PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes,
const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen) {
double single[8], times[8];
single[0]=atom.transfer_time()+ans.transfer_time();
single[1]=nbor.time_nbor.total_seconds();
single[2]=nbor.time_kernel.total_seconds();
single[3]=time_pair.total_seconds();
single[4]=atom.cast_time();
single[4]=atom.cast_time()+ans.cast_time();
single[5]=gpu_overhead;
single[6]=driver_overhead;
single[7]=ans.cpu_idle_time();
MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
double my_max_bytes=max_bytes;
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
if (replica_me()==0)
if (screen && times[3]>0.0) {
if (screen && times[5]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (procs_per_gpu()==1) {
if (time_device()) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size);
@@ -212,7 +427,71 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size);
}
fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",threads_per_atom);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
}
template <class numtyp, class acctyp>
void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in,
UCL_Timer &time_out,
UCL_Timer &time_map,
UCL_Timer &time_rho,
UCL_Timer &time_interp,
PairGPUAns<numtyp,acctyp> &ans,
const double max_bytes,
const double cpu_time,
const double idle_time, FILE *screen) {
double single[8], times[8];
single[0]=time_out.total_seconds();
single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time();
single[2]=time_map.total_seconds();
single[3]=time_rho.total_seconds();
single[4]=time_interp.total_seconds();
single[5]=ans.transfer_time()+ans.cast_time();
single[6]=cpu_time;
single[7]=idle_time;
MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica);
double my_max_bytes=max_bytes+atom.max_gpu_bytes();
double mpi_max_bytes;
MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica);
double max_mb=mpi_max_bytes/(1024.0*1024.0);
if (replica_me()==0)
if (screen && times[6]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (time_device()) {
fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size);
fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size);
fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size);
fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size);
fprintf(screen,"Force interp: %.4f s.\n",times[4]/_replica_size);
fprintf(screen,"Total rho: %.4f s.\n",
(times[0]+times[2]+times[3])/_replica_size);
fprintf(screen,"Total interp: %.4f s.\n",
(times[1]+times[4])/_replica_size);
fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size);
fprintf(screen,"Total: %.4f s.\n",
(times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/
_replica_size);
}
fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size);
fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"-------------------------------------");
@@ -223,10 +502,17 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split,
template <class numtyp, class acctyp>
void PairGPUDeviceT::clear() {
if (_init_count>0) {
_long_range_precompute=0;
_init_count--;
if (_init_count==0) {
atom.clear();
nbor.clear();
_nbor_shared.clear();
if (_compiled) {
k_zero.clear();
k_info.clear();
delete dev_program;
_compiled=false;
}
}
}
}
@@ -241,21 +527,80 @@ void PairGPUDeviceT::clear_device() {
}
}
template <class numtyp, class acctyp>
int PairGPUDeviceT::compile_kernels() {
int flag=0;
if (_compiled)
return flag;
std::string flags="-cl-mad-enable";
dev_program=new UCL_Program(*gpu);
int success=dev_program->load_string(pair_gpu_dev_kernel,flags.c_str());
if (success!=UCL_SUCCESS)
return -4;
k_zero.set_function(*dev_program,"kernel_zero");
k_info.set_function(*dev_program,"kernel_info");
_compiled=true;
UCL_H_Vec<int> h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED);
UCL_D_Vec<int> d_gpu_lib_data(14,*gpu);
k_info.set_size(1,1);
k_info.run(&d_gpu_lib_data.begin());
ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false);
#ifndef USE_OPENCL
if (static_cast<double>(h_gpu_lib_data[0])/100.0>gpu->arch())
return -4;
#endif
_num_mem_threads=h_gpu_lib_data[1];
_warp_size=h_gpu_lib_data[2];
if (_threads_per_atom<1)
_threads_per_atom=h_gpu_lib_data[3];
if (_threads_per_charge<1)
_threads_per_charge=h_gpu_lib_data[13];
_pppm_max_spline=h_gpu_lib_data[4];
_pppm_block=h_gpu_lib_data[5];
_block_pair=h_gpu_lib_data[6];
_max_shared_types=h_gpu_lib_data[7];
_block_cell_2d=h_gpu_lib_data[8];
_block_cell_id=h_gpu_lib_data[9];
_block_nbor_build=h_gpu_lib_data[10];
_block_bio_pair=h_gpu_lib_data[11];
_max_bio_shared_types=h_gpu_lib_data[12];
if (static_cast<size_t>(_block_pair)>gpu->group_size())
_block_pair=gpu->group_size();
if (static_cast<size_t>(_block_bio_pair)>gpu->group_size())
_block_bio_pair=gpu->group_size();
if (_threads_per_atom>_warp_size)
_threads_per_atom=_warp_size;
if (_warp_size%_threads_per_atom!=0)
_threads_per_atom=1;
if (_threads_per_charge>_warp_size)
_threads_per_charge=_warp_size;
if (_warp_size%_threads_per_charge!=0)
_threads_per_charge=1;
return flag;
}
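// For reference, the layout of h_gpu_lib_data as read back above:
//   [0]  minimum arch*100      [1]  mem-access threads  [2]  warp size
//   [3]  default threads/atom  [4]  PPPM max spline     [5]  PPPM block
//   [6]  pair block            [7]  max shared types    [8]  cell 2d block
//   [9]  cell id block        [10]  nbor build block   [11]  bio pair block
//  [12]  max bio shared types [13]  default threads/charge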
template <class numtyp, class acctyp>
double PairGPUDeviceT::host_memory_usage() const {
return atom.host_memory_usage()+
nbor.host_memory_usage()+4*sizeof(numtyp)+
return atom.host_memory_usage()+4*sizeof(numtyp)+
sizeof(PairGPUDevice<numtyp,acctyp>);
}
template class PairGPUDevice<PRECISION,ACC_PRECISION>;
PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads) {
int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads,
const int t_per_atom) {
return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode,
particle_split,nthreads);
particle_split,nthreads,t_per_atom);
}
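// A hedged host-side sketch (handle_err is a hypothetical error handler;
// codes follow pair_gpu_device.h):
//
//   int err=lmp_init_device(world,replica,0,0,GPU_NEIGH,1.0,1,1);
//   if (err!=0) handle_err(err); // -2: no device, -4: wrong arch/library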
void lmp_clear_device() {
@@ -264,14 +609,5 @@ void lmp_clear_device() {
double lmp_gpu_forces(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
if (pair_gpu_device.init_count()) {
pair_gpu_device.stop_host_timer();
pair_gpu_device.gpu->sync();
double evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul);
pair_gpu_device.atom.get_answers(f,tor);
return evdw;
}
return 0.0;
return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
}
==== pair_gpu_device.h ====
@@ -19,11 +19,17 @@
#define PAIR_GPU_DEVICE_H
#include "pair_gpu_atom.h"
#include "pair_gpu_ans.h"
#include "pair_gpu_nbor.h"
#include "pppm_gpu_memory.h"
#include "mpi.h"
#include <sstream>
#include "stdio.h"
#include <string>
#include <queue>
template <class numtyp, class acctyp,
class grdtyp, class grdtyp4> class PPPMGPUMemory;
template <class numtyp, class acctyp>
class PairGPUDevice {
@@ -33,10 +39,15 @@ class PairGPUDevice {
/// Initialize the device for use by this process
/** Sets up a per-device MPI communicator for load balancing and initializes
* the device (>=first_gpu and <=last_gpu) that this proc will be using **/
bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
* the device (>=first_gpu and <=last_gpu) that this proc will be using
* Returns:
* - 0 if successful
* - -2 if the GPU is not found
* - -4 if the GPU library was not compiled for this GPU **/
int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
const int last_gpu, const int gpu_mode,
const double particle_split, const int nthreads);
const double particle_split, const int nthreads,
const int t_per_atom);
/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored
@@ -50,19 +61,67 @@ class PairGPUDevice {
* \param max_nbors Initial number of rows in the neighbor matrix
* \param cell_size cutoff+skin
* \param pre_cut True if the cutoff test will be performed in a separate
* kernel rather than in the force kernel **/
bool init(const bool charge, const bool rot, const int nlocal,
const int host_nlocal, const int nall, const int maxspecial,
const bool gpu_nbor, const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut);
* kernel rather than in the force kernel
* Returns:
* - 0 if successful
* - -1 if fix gpu not found (device not initialized)
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(PairGPUAns<numtyp,acctyp> &a, const bool charge, const bool rot,
const int nlocal, const int host_nlocal, const int nall,
PairGPUNbor *nbor, const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size, const bool pre_cut);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found (device not initialized)
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for this GPU
* - -5 if double precision is not supported on the card **/
int init(PairGPUAns<numtyp,acctyp> &ans, const int nlocal, const int nall);
/// Output a message for pair_style acceleration with device stats
void init_message(FILE *screen, const char *name,
const int first_gpu, const int last_gpu);
/// Perform charge assignment asynchronously for PPPM
void set_single_precompute(PPPMGPUMemory<numtyp,acctyp,
float,_lgpu_float4> *pppm);
/// Perform charge assignment asynchronously for PPPM
void set_double_precompute(PPPMGPUMemory<numtyp,acctyp,
double,_lgpu_double4> *pppm);
/// Estimate the overhead of GPU calls from multiple procs
/** \param kernel_calls Number of kernel calls per timestep used for the
* overhead estimate
* \param gpu_overhead Estimated GPU overhead per timestep (s)
* \param driver_overhead Estimated driver overhead per timestep (s) **/
void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
double &gpu_driver_overhead);
/// Returns true if double precision is supported on card
inline bool double_precision() { return gpu->double_precision(); }
/// Output a message with timing information
void output_times(UCL_Timer &time_pair, const double avg_split,
const double max_bytes, FILE *screen);
void output_times(UCL_Timer &time_pair, PairGPUAns<numtyp,acctyp> &ans,
PairGPUNbor &nbor, const double avg_split,
const double max_bytes, const double gpu_overhead,
const double driver_overhead,
const int threads_per_atom, FILE *screen);
/// Output a message with timing information
void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out,
UCL_Timer &time_map, UCL_Timer &time_rho,
UCL_Timer &time_interp,
PairGPUAns<numtyp,acctyp> &ans,
const double max_bytes, const double cpu_time,
const double cpu_idle_time, FILE *screen);
/// Clear all memory on host and device associated with atom and nbor data
void clear();
@@ -70,11 +129,37 @@ class PairGPUDevice {
/// Clear all memory on host and device
void clear_device();
/// Add an answer object for putting forces, energies, etc from GPU to LAMMPS
inline void add_ans_object(PairGPUAns<numtyp,acctyp> *ans)
{ ans_queue.push(ans); }
/// Add "answers" (force,energies,etc.) into LAMMPS structures
inline double fix_gpu(double **f, double **tor, double *eatom,
double **vatom, double *virial, double &ecoul) {
atom.data_unavail();
if (ans_queue.empty()==false) {
stop_host_timer();
double evdw=0.0;
while (ans_queue.empty()==false) {
evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
ans_queue.pop();
}
return evdw;
}
return 0.0;
}
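// (Hedged call-pattern sketch: each accelerated style pushes its answer
// object via add_ans_object() after launching its kernels; a single
// fix_gpu() call per timestep then drains the queue, copying forces into
// LAMMPS arrays and accumulating evdwl.)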
/// Start timer on host
inline void start_host_timer() { _cpu_full=MPI_Wtime(); }
inline void start_host_timer()
{ _cpu_full=MPI_Wtime(); _host_timer_started=true; }
/// Stop timer on host
inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; }
inline void stop_host_timer() {
if (_host_timer_started) {
_cpu_full=MPI_Wtime()-_cpu_full;
_host_timer_started=false;
}
}
/// Return host time
inline double host_time() { return _cpu_full; }
@@ -114,6 +199,42 @@ class PairGPUDevice {
inline double particle_split() const { return _particle_split; }
/// Return the initialization count for the device
inline int init_count() const { return _init_count; }
/// True if device is being timed
inline bool time_device() const { return _time_device; }
/// Return the number of threads accessing memory simultaneously
inline int num_mem_threads() const { return _num_mem_threads; }
/// Return the number of threads per atom for pair styles
inline int threads_per_atom() const { return _threads_per_atom; }
/// Return the number of threads per atom for pair styles using charge
inline int threads_per_charge() const { return _threads_per_charge; }
/// Return the min of the pair block size or the device max block size
inline int pair_block_size() const { return _block_pair; }
/// Return the maximum number of atom types that can be used with shared mem
inline int max_shared_types() const { return _max_shared_types; }
/// Return the maximum order for PPPM splines
inline int pppm_max_spline() const { return _pppm_max_spline; }
/// Return the block size for PPPM kernels
inline int pppm_block() const { return _pppm_block; }
/// Return the block size for neighbor binning
inline int block_cell_2d() const { return _block_cell_2d; }
/// Return the block size for atom mapping for neighbor builds
inline int block_cell_id() const { return _block_cell_id; }
/// Return the block size for neighbor build kernel
inline int block_nbor_build() const { return _block_nbor_build; }
/// Return the block size for "bio" pair styles
inline int block_bio_pair() const { return _block_bio_pair; }
/// Return the maximum number of atom types for shared mem with "bio" styles
inline int max_bio_shared_types() const { return _max_bio_shared_types; }
// -------------------- SHARED DEVICE ROUTINES --------------------
// Perform asynchronous zero of integer array
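// (e.g. numel=1000 with a 128-thread pair block launches ceil(1000/128)=8
// blocks, each zeroing up to _block_pair elements)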
void zero(UCL_D_Vec<int> &mem, const int numel) {
int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
_block_pair));
k_zero.set_size(num_blocks,_block_pair);
k_zero.run(&mem.begin(),&numel);
}
// -------------------------- DEVICE DATA -------------------------
@@ -130,11 +251,30 @@ class PairGPUDevice {
// --------------------------- NBOR DATA ----------------------------
/// Neighbor Data
PairGPUNbor nbor;
PairGPUNborShared _nbor_shared;
// ------------------------ LONG RANGE DATA -------------------------
// Long Range Data
int _long_range_precompute;
PPPMGPUMemory<numtyp,acctyp,float,_lgpu_float4> *pppm_single;
PPPMGPUMemory<numtyp,acctyp,double,_lgpu_double4> *pppm_double;
/// Precomputations for long range charge assignment (asynchronously)
inline void precompute(const int ago, const int nlocal, const int nall,
double **host_x, int *host_type, bool &success,
double *charge, double *boxlo, double *prd) {
if (_long_range_precompute==1)
pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
else if (_long_range_precompute==2)
pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge,
boxlo,prd);
}
private:
std::queue<PairGPUAns<numtyp,acctyp> *> ans_queue;
int _init_count;
bool _device_init;
bool _device_init, _host_timer_started, _time_device;
MPI_Comm _comm_world, _comm_replica, _comm_gpu;
int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
_replica_size;
@@ -142,6 +282,19 @@ class PairGPUDevice {
double _particle_split;
double _cpu_full;
int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
int _pppm_max_spline, _pppm_block;
int _block_pair, _max_shared_types;
int _block_cell_2d, _block_cell_id, _block_nbor_build;
int _block_bio_pair, _max_bio_shared_types;
UCL_Program *dev_program;
UCL_Kernel k_zero, k_info;
bool _compiled;
int compile_kernels();
int _data_in_estimate, _data_out_estimate;
template <class t>
inline std::string toa(const t& in) {
std::ostringstream o;
==== pair_gpu_nbor.cpp ====
@@ -18,15 +18,9 @@
#include "pair_gpu_precision.h"
#include "pair_gpu_nbor.h"
#include "pair_gpu_device.h"
#include "math.h"
#ifdef USE_OPENCL
#include "pair_gpu_nbor_cl.h"
#else
#include "pair_gpu_nbor_ptx.h"
#include "pair_gpu_build_ptx.h"
#endif
int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
if (_gpu_nbor)
return (max_nbors+2)*sizeof(int);
@@ -36,12 +30,18 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const {
return (max_nbors+3)*sizeof(int);
}
bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
bool PairGPUNbor::init(PairGPUNborShared *shared, const int inum,
const int host_inum, const int max_nbors,
const int maxspecial, UCL_Device &devi,
const bool gpu_nbor, const int gpu_host,
const bool pre_cut) {
const bool pre_cut, const int block_cell_2d,
const int block_cell_id, const int block_nbor_build) {
clear();
_block_cell_2d=block_cell_2d;
_block_cell_id=block_cell_id;
_block_nbor_build=block_nbor_build;
_shared=shared;
dev=&devi;
_gpu_nbor=gpu_nbor;
if (gpu_host==0)
@@ -80,8 +80,11 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
success=success && (host_packed.alloc(2*IJ_SIZE,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
alloc(success);
if (!success)
return false;
if (_use_packing==false)
compile_kernels(devi);
_shared->compile_kernels(devi,gpu_nbor);
return success;
}
@@ -89,13 +92,14 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors,
void PairGPUNbor::alloc(bool &success) {
dev_nbor.clear();
host_acc.clear();
int nt=_max_atoms+_max_host;
if (_use_packing==false || _gpu_nbor)
success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
else
success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
UCL_READ_ONLY)==UCL_SUCCESS);
success=success && (host_acc.alloc((_max_atoms+_max_host)*2,*dev,
success=success && (host_acc.alloc(nt*2,*dev,
UCL_WRITE_OPTIMIZED)==UCL_SUCCESS);
_c_bytes=dev_nbor.row_bytes();
@@ -108,11 +112,31 @@ void PairGPUNbor::alloc(bool &success) {
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc((_max_nbors+1)*_max_host,*dev,
dev_host_numj.clear();
host_ilist.clear();
host_jlist.clear();
success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host,
success=success && (dev_host_nbor.alloc(_max_nbors*_max_host,
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
_c_bytes+=dev_host_nbor.row_bytes();
success=success && (dev_host_numj.alloc(_max_host,*dev,
UCL_WRITE_ONLY)==UCL_SUCCESS);
success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
for (int i=0; i<nt; i++)
host_ilist[i]=i;
success=success && (host_jlist.alloc(_max_host,*dev,
UCL_NOT_PINNED)==UCL_SUCCESS);
if (!success)
return;
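// Each host_jlist entry points at that particle's row of _max_nbors ints
// inside the host_nbor buffer.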
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=_max_nbors;
}
_c_bytes+=dev_host_nbor.row_bytes()+dev_host_numj.row_bytes();
}
if (_maxspecial>0) {
dev_nspecial.clear();
@@ -145,6 +169,9 @@ void PairGPUNbor::clear() {
dev_host_nbor.clear();
dev_packed.clear();
host_nbor.clear();
dev_host_numj.clear();
host_ilist.clear();
host_jlist.clear();
dev_nspecial.clear();
dev_special.clear();
dev_special_t.clear();
@@ -152,27 +179,13 @@ void PairGPUNbor::clear() {
time_kernel.clear();
time_nbor.clear();
}
if (_compiled) {
if (_gpu_nbor) {
k_cell_id.clear();
k_cell_counts.clear();
k_build_nbor.clear();
k_transpose.clear();
k_special.clear();
delete build_program;
} else {
k_nbor.clear();
delete nbor_program;
}
_compiled=false;
}
}
double PairGPUNbor::host_memory_usage() const {
if (_gpu_nbor) {
if (_gpu_host)
return host_nbor.row_bytes()*host_nbor.rows();
return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+
host_jlist.row_bytes();
else
return 0;
} else
@@ -186,7 +199,7 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,
UCL_H_Vec<int> ilist_view;
ilist_view.view(ilist,inum,*dev);
ucl_copy(dev_nbor,ilist_view,true);
ucl_copy(dev_nbor,ilist_view,false);
UCL_D_Vec<int> nbor_offset;
UCL_H_Vec<int> host_offset;
@@ -238,46 +251,20 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj,
if (_use_packing==false) {
time_kernel.start();
int GX=static_cast<int>(ceil(static_cast<double>(inum)/block_size));
k_nbor.set_size(GX,block_size);
k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
_shared->k_nbor.set_size(GX,block_size);
_shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum);
time_kernel.stop();
}
}
void PairGPUNbor::compile_kernels(UCL_Device &dev) {
std::string flags="-cl-fast-relaxed-math -cl-mad-enable";
if (_gpu_nbor==false) {
nbor_program=new UCL_Program(dev);
nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str());
k_nbor.set_function(*nbor_program,"kernel_unpack");
} else {
build_program=new UCL_Program(dev);
#ifdef USE_OPENCL
std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n";
exit(1);
#else
build_program->load_string(pair_gpu_build_kernel,flags.c_str());
#endif
k_cell_id.set_function(*build_program,"calc_cell_id");
k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts");
k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
k_transpose.set_function(*build_program,"transpose");
k_special.set_function(*build_program,"kernel_special");
neigh_tex.get_texture(*build_program,"neigh_tex");
}
_compiled=true;
}
template <class numtyp, class acctyp>
void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
const int nall,
PairGPUAtom<numtyp,acctyp> &atom,
double *boxlo, double *boxhi, int *tag,
double *sublo, double *subhi, int *tag,
int **nspecial, int **special, bool &success,
int &mn) {
const int nt=inum+host_inum;
if (_maxspecial>0) {
time_nbor.start();
UCL_H_Vec<int> view_nspecial, view_special, view_tag;
@@ -290,25 +277,25 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
time_nbor.stop();
time_nbor.add_to_total();
time_kernel.start();
const int b2x=8;
const int b2y=8;
const int b2x=_block_cell_2d;
const int b2y=_block_cell_2d;
const int g2x=static_cast<int>(ceil(static_cast<double>(_maxspecial)/b2x));
const int g2y=static_cast<int>(ceil(static_cast<double>(nt)/b2y));
k_transpose.set_size(g2x,g2y,b2x,b2y);
k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),&_maxspecial,
&nt);
_shared->k_transpose.set_size(g2x,g2y,b2x,b2y);
_shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),
&_maxspecial,&nt);
} else
time_kernel.start();
_nbor_pitch=inum;
neigh_tex.bind_float(atom.dev_x,4);
_shared->neigh_tex.bind_float(atom.dev_x,4);
int ncellx, ncelly, ncellz, ncell_3d;
ncellx = static_cast<int>(ceil(((boxhi[0] - boxlo[0]) +
ncellx = static_cast<int>(ceil(((subhi[0] - sublo[0]) +
2.0*_cell_size)/_cell_size));
ncelly = static_cast<int>(ceil(((boxhi[1] - boxlo[1]) +
ncelly = static_cast<int>(ceil(((subhi[1] - sublo[1]) +
2.0*_cell_size)/_cell_size));
ncellz = static_cast<int>(ceil(((boxhi[2] - boxlo[2]) +
ncellz = static_cast<int>(ceil(((subhi[2] - sublo[2]) +
2.0*_cell_size)/_cell_size));
ncell_3d = ncellx * ncelly * ncellz;
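// (e.g. a 20 x 20 x 20 subdomain with cell_size 2.5 gives
//  ceil((20+5)/2.5)=10 cells per side, so ncell_3d=1000; the two extra
//  cell widths cover ghost atoms on either side)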
UCL_D_Vec<int> cell_counts;
@@ -316,35 +303,36 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
_cell_bytes=cell_counts.row_bytes();
/* build cell list on GPU */
const int neigh_block=128;
const int neigh_block=_block_cell_id;
const int GX=(int)ceil((float)nall/neigh_block);
const numtyp boxlo0=static_cast<numtyp>(boxlo[0]);
const numtyp boxlo1=static_cast<numtyp>(boxlo[1]);
const numtyp boxlo2=static_cast<numtyp>(boxlo[2]);
const numtyp boxhi0=static_cast<numtyp>(boxhi[0]);
const numtyp boxhi1=static_cast<numtyp>(boxhi[1]);
const numtyp boxhi2=static_cast<numtyp>(boxhi[2]);
const numtyp sublo0=static_cast<numtyp>(sublo[0]);
const numtyp sublo1=static_cast<numtyp>(sublo[1]);
const numtyp sublo2=static_cast<numtyp>(sublo[2]);
const numtyp subhi0=static_cast<numtyp>(subhi[0]);
const numtyp subhi1=static_cast<numtyp>(subhi[1]);
const numtyp subhi2=static_cast<numtyp>(subhi[2]);
const numtyp cell_size_cast=static_cast<numtyp>(_cell_size);
k_cell_id.set_size(GX,neigh_block);
k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
&atom.dev_particle_id.begin(),
&boxlo0, &boxlo1, &boxlo2, &boxhi0, &boxhi1,
&boxhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
_shared->k_cell_id.set_size(GX,neigh_block);
_shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(),
&atom.dev_particle_id.begin(),
&sublo0, &sublo1, &sublo2, &subhi0, &subhi1,
&subhi2, &cell_size_cast, &ncellx, &ncelly, &nall);
atom.sort_neighbor(nall);
/* calculate cell count */
k_cell_counts.set_size(GX,neigh_block);
k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), &nall,
&ncell_3d);
_shared->k_cell_counts.set_size(GX,neigh_block);
_shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(),
&nall, &ncell_3d);
/* build the neighbor list */
const int cell_block=64;
k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
&cell_counts.begin(), &dev_nbor.begin(),
&dev_host_nbor.begin(), &_max_nbors, &cell_size_cast,
&ncellx, &ncelly, &ncellz, &inum, &nt, &nall);
const int cell_block=_block_nbor_build;
_shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1);
_shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(),
&cell_counts.begin(), &dev_nbor.begin(),
&dev_host_nbor.begin(), &dev_host_numj.begin(),
&_max_nbors,&cell_size_cast,
&ncellx, &ncelly, &ncellz, &inum, &nt, &nall);
/* Get the maximum number of nbors and realloc if necessary */
UCL_D_Vec<int> numj;
@@ -353,7 +341,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
if (nt>inum) {
UCL_H_Vec<int> host_offset;
host_offset.view_offset(inum,host_acc,nt-inum);
ucl_copy(host_offset,dev_host_nbor,nt-inum,false);
ucl_copy(host_offset,dev_host_numj,nt-inum,false);
}
mn=host_acc[0];
for (int i=1; i<nt; i++)
@@ -368,10 +356,15 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
if (_max_host>0) {
host_nbor.clear();
dev_host_nbor.clear();
success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor,
success=success && (host_nbor.alloc(mn*_max_host,dev_nbor,
UCL_RW_OPTIMIZED)==UCL_SUCCESS);
success=success && (dev_host_nbor.alloc((mn+1)*_max_host,
success=success && (dev_host_nbor.alloc(mn*_max_host,
dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS);
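// Re-point host_jlist at the reallocated buffer; the row stride is now mn.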
int *ptr=host_nbor.begin();
for (int i=0; i<_max_host; i++) {
host_jlist[i]=ptr;
ptr+=mn;
}
_gpu_bytes+=dev_host_nbor.row_bytes();
}
if (_alloc_packed) {
@@ -385,28 +378,29 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
_max_nbors=mn;
time_kernel.stop();
time_kernel.add_to_total();
build_nbor_list(inum, host_inum, nall, atom, boxlo, boxhi, tag, nspecial,
build_nbor_list(inum, host_inum, nall, atom, sublo, subhi, tag, nspecial,
special, success, mn);
return;
}
if (_maxspecial>0) {
const int GX2=static_cast<int>(ceil(static_cast<double>(nt)/cell_block));
k_special.set_size(GX2,cell_block);
k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
&atom.dev_tag.begin(), &dev_nspecial.begin(),
&dev_special.begin(), &inum, &nt, &nall);
_shared->k_special.set_size(GX2,cell_block);
_shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(),
&dev_host_numj.begin(), &atom.dev_tag.begin(),
&dev_nspecial.begin(), &dev_special.begin(),
&inum, &nt, &nall, &_max_nbors);
}
time_kernel.stop();
time_nbor.start();
if (_gpu_host)
ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false);
ucl_copy(host_nbor,dev_host_nbor,false);
time_nbor.stop();
}
template void PairGPUNbor::build_nbor_list<PRECISION,ACC_PRECISION>
(const int inum, const int host_inum, const int nall,
PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *boxlo, double *boxhi,
(const int inum, const int host_inum, const int nall,
PairGPUAtom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
int *, int **, int **, bool &success, int &mn);
==== pair_gpu_nbor.h ====
@@ -19,32 +19,27 @@
#define PAIR_GPU_NBOR_H
#include "pair_gpu_atom.h"
#include "pair_gpu_nbor_shared.h"
#define IJ_SIZE 131072
#ifdef USE_OPENCL
#include "geryon/ocl_device.h"
#include "geryon/ocl_timer.h"
#include "geryon/ocl_mat.h"
#include "geryon/ocl_kernel.h"
#include "geryon/ocl_texture.h"
using namespace ucl_opencl;
#else
#include "geryon/nvd_device.h"
#include "geryon/nvd_timer.h"
#include "geryon/nvd_mat.h"
#include "geryon/nvd_kernel.h"
#include "geryon/nvd_texture.h"
using namespace ucl_cudadr;
#endif
class PairGPUNbor {
public:
PairGPUNbor() : _allocated(false), _use_packing(false), _compiled(false) {}
PairGPUNbor() : _allocated(false), _use_packing(false) {}
~PairGPUNbor() { clear(); }
/// Determine whether neighbor unpacking should be used
@@ -62,9 +57,11 @@ class PairGPUNbor {
* 2 if gpu_nbor is true, and host needs a full nbor list
* \param pre_cut True if the cutoff test will be performed in a separate
* kernel rather than in the force kernel **/
bool init(const int inum, const int host_inum, const int max_nbors,
const int maxspecial, UCL_Device &dev, const bool gpu_nbor,
const int gpu_host, const bool pre_cut);
bool init(PairGPUNborShared *shared, const int inum, const int host_inum,
const int max_nbors, const int maxspecial, UCL_Device &dev,
const bool gpu_nbor, const int gpu_host, const bool pre_cut,
const int block_cell_2d, const int block_cell_id,
const int block_nbor_build);
/// Set the size of the cutoff+skin
inline void cell_size(const double size) { _cell_size=size; }
@@ -131,18 +128,18 @@ class PairGPUNbor {
inline int max_nbors() const { return _max_nbors; }
/// Loop through neighbor count array and return maximum nbors for a particle
inline int max_nbor_loop(const int inum, int *numj) const {
inline int max_nbor_loop(const int inum, int *numj, int *ilist) const {
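// numj is indexed by atom index, so map through ilist for each list entry.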
int mn=0;
for (int i=0; i<inum; i++)
mn=std::max(mn,numj[i]);
mn=std::max(mn,numj[ilist[i]]);
return mn;
}
/// Build nbor list on the device
template <class numtyp, class acctyp>
void build_nbor_list(const int inum, const int host_inum, const int nall,
PairGPUAtom<numtyp,acctyp> &atom, double *boxlo,
double *boxhi, int *tag, int **nspecial, int **special,
PairGPUAtom<numtyp,acctyp> &atom, double *sublo,
double *subhi, int *tag, int **nspecial, int **special,
bool &success, int &max_nbors);
/// Return the number of bytes used on device
@@ -176,31 +173,31 @@ class PairGPUNbor {
UCL_H_Vec<int> host_nbor;
/// Device storage for neighbor list matrix that will be copied to host
/** - 1st row is numj
* - Remaining rows are nbors **/
* - Remaining rows are by atom, columns are nbors **/
UCL_D_Vec<int> dev_host_nbor;
UCL_D_Vec<int> dev_host_numj;
UCL_H_Vec<int> host_ilist;
UCL_H_Vec<int*> host_jlist;
/// Device storage for special neighbor counts
UCL_D_Vec<int> dev_nspecial;
/// Device storage for special neighbors
UCL_D_Vec<int> dev_special, dev_special_t;
/// Texture for cached position/type access with CUDA
UCL_Texture neigh_tex;
/// Device timers
UCL_Timer time_nbor, time_kernel;
private:
PairGPUNborShared *_shared;
UCL_Device *dev;
UCL_Program *nbor_program, *build_program;
UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor;
UCL_Kernel k_transpose, k_special;
bool _allocated, _use_packing, _compiled;
void compile_kernels(UCL_Device &dev);
bool _allocated, _use_packing;
int _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
bool _gpu_nbor, _gpu_host, _alloc_packed;
double _cell_size;
double _gpu_bytes, _c_bytes, _cell_bytes;
void alloc(bool &success);
int _block_cell_2d, _block_cell_id, _block_nbor_build;
};
#endif
==== pair_gpu_precision.h ====
@@ -84,8 +84,6 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) {
#define acctyp4 _lgpu_float4
#endif
#define MAX_SHARED_TYPES 8
#define MAX_BIO_SHARED_TYPES 128
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#endif