git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6340 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-06-08 20:24:29 +00:00 · 2011-06-08 20:24:29 +00:00 · 98ccbe3e69
parent 029f0ee874
commit 98ccbe3e69
24 changed files with 173 additions and 113 deletions
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@ -43,19 +43,22 @@ CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
        $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
 OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
       $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \
-       $(OBJ_DIR)/pair_gpu_device.o \
-       $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
+       $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
+       $(OBJ_DIR)/charge_gpu_memory.o $(OBJ_DIR)/base_ellipsoid.o \
       $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
-       $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
+       $(OBJ_DIR)/gayberne.o $(OBJ_DIR)/gayberne_ext.o \
+       $(OBJ_DIR)/re_squared.o $(OBJ_DIR)/re_squared_ext.o \
       $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
       $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
       $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
       $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
       $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
+       $(OBJ_DIR)/lj_class2_long.o $(OBJ_DIR)/lj_class2_long_ext.o \
       $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
       $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
       $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
       $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
+       $(OBJ_DIR)/cmmc_msm_gpu_memory.o $(OBJ_DIR)/cmmc_msm_gpu.o \
       $(CUDPP)
 PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
       $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
@ -63,17 +66,21 @@ PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \
       $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
       $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h \
       $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h \
-       $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
-       $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
+       $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h \
+       $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx \
+       $(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/re_squared.ptx \
+       $(OBJ_DIR)/re_squared_lj.ptx $(OBJ_DIR)/re_squared_ptx.h \
       $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
       $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
       $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h \
       $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
       $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
+       $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long_ptx.h \
       $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h \
       $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \
       $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
-       $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
+       $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h \
+       $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h

 all: $(GPU_LIB) $(EXECS)

@ -140,6 +147,9 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.c
 $(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(CUDR) -o $@ -c charge_gpu_memory.cpp

+$(OBJ_DIR)/base_ellipsoid.o: $(ALL_H) base_ellipsoid.h base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_ptx.h
+	$(CUDR) -o $@ -c base_ellipsoid.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pppm_f_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ pppm_gpu_kernel.cu

@ -158,23 +168,41 @@ $(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(O
 $(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
 	$(CUDR) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu
+$(OBJ_DIR)/ellipsoid_nbor.ptx: ellipsoid_nbor.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ ellipsoid_nbor.cu

-$(OBJ_DIR)/gb_gpu_kernel_lj.ptx: gb_gpu_kernel_lj.cu pair_gpu_precision.h gb_gpu_extra.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_lj.cu
+$(OBJ_DIR)/ellipsoid_nbor_ptx.h: $(OBJ_DIR)/ellipsoid_nbor.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ellipsoid_nbor.ptx $(OBJ_DIR)/ellipsoid_nbor_ptx.h

-$(OBJ_DIR)/gb_gpu_kernel_nbor.ptx: gb_gpu_kernel_nbor.cu pair_gpu_precision.h
-	$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_nbor.cu
+$(OBJ_DIR)/gayberne.ptx: gayberne.cu pair_gpu_precision.h ellipsoid_extra.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ gayberne.cu

-$(OBJ_DIR)/gb_gpu_ptx.h: $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx
-	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h
+$(OBJ_DIR)/gayberne_lj.ptx: gayberne_lj.cu pair_gpu_precision.h ellipsoid_extra.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ gayberne_lj.cu

-$(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_ptx.h
-	$(CUDR) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/gayberne_ptx.h: $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gayberne.ptx $(OBJ_DIR)/gayberne_lj.ptx $(OBJ_DIR)/gayberne_ptx.h

-$(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
-	$(CUDR) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/gayberne.o: $(ALL_H) gayberne.h gayberne.cpp $(OBJ_DIR)/gayberne_ptx.h $(OBJ_DIR)/base_ellipsoid.o
+	$(CUDR) -o $@ -c gayberne.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/gayberne.o gayberne_ext.cpp
+	$(CUDR) -o $@ -c gayberne_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/re_squared.ptx: re_squared.cu pair_gpu_precision.h ellipsoid_extra.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ re_squared.cu
+
+$(OBJ_DIR)/re_squared_lj.ptx: re_squared_lj.cu pair_gpu_precision.h ellipsoid_extra.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ re_squared_lj.cu
+
+$(OBJ_DIR)/re_squared_ptx.h: $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_lj.ptx
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/re_squared.ptx $(OBJ_DIR)/re_squared_lj.ptx $(OBJ_DIR)/re_squared_ptx.h
+
+$(OBJ_DIR)/re_squared.o: $(ALL_H) re_squared.h re_squared.cpp $(OBJ_DIR)/re_squared_ptx.h $(OBJ_DIR)/base_ellipsoid.o
+	$(CUDR) -o $@ -c re_squared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/re_squared_ext.o: $(ALL_H) $(OBJ_DIR)/re_squared.o re_squared_ext.cpp
+	$(CUDR) -o $@ -c re_squared_ext.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_cut_gpu_kernel.ptx: lj_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_cut_gpu_kernel.cu
@ -200,6 +228,18 @@ $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memor
 $(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)

+$(OBJ_DIR)/lj_class2_long.ptx: lj_class2_long.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ lj_class2_long.cu
+
+$(OBJ_DIR)/lj_class2_long_ptx.h: $(OBJ_DIR)/lj_class2_long.ptx # generated header; prerequisite of lj_class2_long.o
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_class2_long.ptx $(OBJ_DIR)/lj_class2_long_ptx.h
+
+$(OBJ_DIR)/lj_class2_long.o: $(ALL_H) lj_class2_long.h lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
+	$(CUDR) -o $@ -c lj_class2_long.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_class2_long_ext.o: $(ALL_H) lj_class2_long.h lj_class2_long_ext.cpp charge_gpu_memory.h
+	$(CUDR) -o $@ -c lj_class2_long_ext.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
 	$(CUDA) --ptx -DNV_KERNEL -o $@ ljcl_cut_gpu_kernel.cu

@ -284,6 +324,18 @@ $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu
 $(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h
 	$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)

+$(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx: cmmc_msm_gpu_kernel.cu pair_gpu_precision.h
+	$(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_msm_gpu_kernel.cu
+
+$(OBJ_DIR)/cmmc_msm_gpu_ptx.h: $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx # generated header; prerequisite of cmmc_msm_gpu_memory.o
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_msm_gpu_kernel.ptx $(OBJ_DIR)/cmmc_msm_gpu_ptx.h
+
+$(OBJ_DIR)/cmmc_msm_gpu_memory.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu_memory.cpp $(OBJ_DIR)/cmmc_msm_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
+	$(CUDR) -o $@ -c cmmc_msm_gpu_memory.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/cmmc_msm_gpu.o: $(ALL_H) cmmc_msm_gpu_memory.h cmmc_msm_gpu.cpp charge_gpu_memory.h
+	$(CUDR) -o $@ -c cmmc_msm_gpu.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDART $(CUDA_LINK) 

--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@ -33,25 +33,29 @@ ALL_H = $(OCL_H) $(PAIR_H)
 EXECS = $(BIN_DIR)/ocl_get_devices
 OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \
       $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \
-       $(OBJ_DIR)/pair_gpu_device.o \
-       $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \
+       $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
+       $(OBJ_DIR)/charge_gpu_memory.o $(OBJ_DIR)/base_ellipsoid.o \
       $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \
-       $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
+       $(OBJ_DIR)/gayberne.o $(OBJ_DIR)/gayberne_ext.o \
+       $(OBJ_DIR)/re_squared.o $(OBJ_DIR)/re_squared_ext.o \
       $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
       $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
       $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \
       $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
       $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
+       $(OBJ_DIR)/lj_class2_long.o $(OBJ_DIR)/lj_class2_long_ext.o \
       $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \
       $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \
       $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
       $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o 
 KERS = $(OBJ_DIR)/pair_gpu_dev_cl.h $(OBJ_DIR)/pair_gpu_atom_cl.h \
       $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/pppm_gpu_cl.h \
-       $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
+       $(OBJ_DIR)/ellipsoid_nbor_cl.h $(OBJ_DIR)/gayberne_cl.h \
+       $(OBJ_DIR)/re_squared_cl.h \
       $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
       $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h \
-       $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/morse_gpu_cl.h \
+       $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/lj_class2_long_cl.h \
+       $(OBJ_DIR)/morse_gpu_cl.h \
       $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h \
       $(OBJ_DIR)/cmmc_long_gpu_cl.h 

@ -89,6 +93,9 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.c
 $(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
 	$(OCL) -o $@ -c charge_gpu_memory.cpp

+$(OBJ_DIR)/base_ellipsoid.o: $(OCL_H) base_ellipsoid.h base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cl.h
+	$(OCL) -o $@ -c base_ellipsoid.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/pppm_gpu_cl.h: pppm_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh pppm_gpu_kernel.cu $(OBJ_DIR)/pppm_gpu_cl.h;

@ -98,20 +105,32 @@ $(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp  $(
 $(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp
 	$(OCL) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
-	$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h
+$(OBJ_DIR)/ellipsoid_nbor_cl.h: ellipsoid_nbor.cu
+	$(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h

-$(OBJ_DIR)/gb_gpu_cl.h: gb_gpu_kernel.cu gb_gpu_kernel_lj.cu gb_gpu_extra.h
-	cat gb_gpu_extra.h gb_gpu_kernel.cu > $(OBJ_DIR)/gb_gpu_kernel.tar; \
-	cat gb_gpu_extra.h gb_gpu_kernel_lj.cu > $(OBJ_DIR)/gb_gpu_kernel_lj.tar; \
-	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar $(OBJ_DIR)/gb_gpu_cl.h; \
-	rm -f $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar
+$(OBJ_DIR)/gayberne_cl.h: gayberne.cu gayberne_lj.cu ellipsoid_extra.h
+	cat ellipsoid_extra.h gayberne.cu > $(OBJ_DIR)/gayberne.tar; \
+	cat ellipsoid_extra.h gayberne_lj.cu > $(OBJ_DIR)/gayberne_lj.tar; \
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gayberne.tar $(OBJ_DIR)/gayberne_lj.tar $(OBJ_DIR)/gayberne_cl.h; \
+	rm -f $(OBJ_DIR)/gayberne.tar $(OBJ_DIR)/gayberne_lj.tar

-$(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h
-	$(OCL) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/gayberne.o: $(ALL_H) gayberne.h gayberne.cpp $(OBJ_DIR)/gayberne_cl.h $(OBJ_DIR)/base_ellipsoid.o
+	$(OCL) -o $@ -c gayberne.cpp -I$(OBJ_DIR)

-$(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
-	$(OCL) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)
+$(OBJ_DIR)/gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/gayberne.o gayberne_ext.cpp
+	$(OCL) -o $@ -c gayberne_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/re_squared_cl.h: re_squared.cu re_squared_lj.cu ellipsoid_extra.h
+	cat ellipsoid_extra.h re_squared.cu > $(OBJ_DIR)/re_squared.tar; \
+	cat ellipsoid_extra.h re_squared_lj.cu > $(OBJ_DIR)/re_squared_lj.tar; \
+	$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/re_squared.tar $(OBJ_DIR)/re_squared_lj.tar $(OBJ_DIR)/re_squared_cl.h; \
+	rm -f $(OBJ_DIR)/re_squared.tar $(OBJ_DIR)/re_squared_lj.tar
+
+$(OBJ_DIR)/re_squared.o: $(ALL_H) re_squared.h re_squared.cpp $(OBJ_DIR)/re_squared_cl.h $(OBJ_DIR)/base_ellipsoid.o
+	$(OCL) -o $@ -c re_squared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/re_squared_ext.o: $(ALL_H) $(OBJ_DIR)/re_squared.o re_squared_ext.cpp
+	$(OCL) -o $@ -c re_squared_ext.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh lj_cut_gpu_kernel.cu $(OBJ_DIR)/lj_cut_gpu_cl.h;
@ -134,12 +153,21 @@ $(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_g
 $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh ljcl_cut_gpu_kernel.cu $(OBJ_DIR)/ljcl_cut_gpu_cl.h;

-$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp  $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o
+$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp  $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/charge_gpu_memory.o
 	$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)

 $(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h
 	$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)

+$(OBJ_DIR)/lj_class2_long_cl.h: lj_class2_long.cu
+	$(BSH) ./geryon/file_to_cstr.sh lj_class2_long.cu $(OBJ_DIR)/lj_class2_long_cl.h;
+
+$(OBJ_DIR)/lj_class2_long.o: $(ALL_H) lj_class2_long.h lj_class2_long.cpp  $(OBJ_DIR)/lj_class2_long_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/charge_gpu_memory.o
+	$(OCL) -o $@ -c lj_class2_long.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_class2_long_ext.o: $(ALL_H) lj_class2_long.h lj_class2_long_ext.cpp charge_gpu_memory.h
+	$(OCL) -o $@ -c lj_class2_long_ext.cpp -I$(OBJ_DIR)
+
 $(OBJ_DIR)/morse_gpu_cl.h: morse_gpu_kernel.cu
 	$(BSH) ./geryon/file_to_cstr.sh morse_gpu_kernel.cu $(OBJ_DIR)/morse_gpu_cl.h;

--- a/lib/gpu/cmm_cut_gpu_kernel.cu
+++ b/lib/gpu/cmm_cut_gpu_kernel.cu
@ -84,7 +84,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
@ -241,7 +241,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp* sp_lj_in,__global int *dev_nbor,
                               __global int *dev_packed, __global acctyp4 *ans,
                               __global acctyp *engv, const int eflag,
-                               const int vflag, const int inum, const int nall,
+                               const int vflag, const int inum,
                               const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
--- a/lib/gpu/cmm_cut_gpu_memory.cpp
+++ b/lib/gpu/cmm_cut_gpu_memory.cpp
@ -130,7 +130,6 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
-  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
@ -141,15 +140,14 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch, 
-                          &this->_threads_per_atom);
+                          &ainum, &nbor_pitch, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->_threads_per_atom);
+                     &nbor_pitch, &this->_threads_per_atom);
  }
  this->time_pair.stop();
 }
--- a/lib/gpu/cmmc_long_gpu_kernel.cu
+++ b/lib/gpu/cmmc_long_gpu_kernel.cu
@ -102,7 +102,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                          const int nbor_pitch, __global numtyp *q_ ,
                          const numtyp cut_coulsq, const numtyp qqrd2e,
                          const numtyp g_ewald, const int t_per_atom) {
@ -295,10 +295,9 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv, 
                               const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               __global numtyp *q_ , const numtyp cut_coulsq,
-                               const numtyp qqrd2e, const numtyp g_ewald,
-                               const int t_per_atom) {
+                               const int nbor_pitch, __global numtyp *q_,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
--- a/lib/gpu/cmmc_long_gpu_memory.cpp
+++ b/lib/gpu/cmmc_long_gpu_memory.cpp
@ -141,7 +141,6 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
-  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
@ -152,7 +151,7 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch,
+                          &ainum, &nbor_pitch,
                          &this->atom->dev_q.begin(), &_cut_coulsq,
                          &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
  } else {
@ -161,7 +160,7 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
+                     &nbor_pitch, &this->atom->dev_q.begin(),
                     &_cut_coulsq, &_qqrd2e, &_g_ewald, 
                     &this->_threads_per_atom);
  }
--- a/lib/gpu/crml_gpu_kernel.cu
+++ b/lib/gpu/crml_gpu_kernel.cu
@ -103,12 +103,11 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global int *dev_nbor, __global int *dev_packed,
                          __global acctyp4 *ans, __global acctyp *engv, 
                          const int eflag, const int vflag, const int inum, 
-                          const int nall, const int nbor_pitch,
-                          __global numtyp *q_, const numtyp cut_coulsq,
-                          const numtyp qqrd2e, const numtyp g_ewald,
-                          const numtyp denom_lj, const numtyp cut_bothsq, 
-                          const numtyp cut_ljsq, const numtyp cut_lj_innersq,
-                          const int t_per_atom) {
+                          const int nbor_pitch, __global numtyp *q_,
+                          const numtyp cut_coulsq, const numtyp qqrd2e,
+                          const numtyp g_ewald, const numtyp denom_lj,
+                          const numtyp cut_bothsq, const numtyp cut_ljsq,
+                          const numtyp cut_lj_innersq, const int t_per_atom) {

  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
@ -300,7 +299,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
                               __global numtyp* sp_lj_in, __global int *dev_nbor, 
                               __global int *dev_packed, __global acctyp4 *ans,
                               __global acctyp *engv, const int eflag,
-                               const int vflag, const int inum, const int nall,
+                               const int vflag, const int inum,
                               const int nbor_pitch, __global numtyp *q_,
                               const numtyp cut_coulsq, const numtyp qqrd2e,
                               const numtyp g_ewald, const numtyp denom_lj,
--- a/lib/gpu/crml_gpu_memory.cpp
+++ b/lib/gpu/crml_gpu_memory.cpp
@ -145,7 +145,6 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
-  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
@ -155,10 +154,9 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch,
-                          &this->atom->dev_q.begin(), &_cut_coulsq,
-                          &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq,
-                          &_cut_ljsq, &_cut_lj_innersq, 
+                          &ainum, &nbor_pitch, &this->atom->dev_q.begin(),
+                          &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
+                          &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, 
                          &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
@ -166,7 +164,7 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
+                     &nbor_pitch, &this->atom->dev_q.begin(),
                     &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj,
                     &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
                     &this->_threads_per_atom);
--- a/lib/gpu/geryon/ucl_nv_kernel.h
+++ b/lib/gpu/geryon/ucl_nv_kernel.h
@ -54,8 +54,11 @@ typedef struct _double4 double4;
 #define __kernel extern "C" __global__
 #define __local __shared__
 #define __global  
-#define __inline static __inline__ __device__ 
 #define atom_add atomicAdd

+#ifndef __inline
+#define __inline static __inline__ __device__ 
+#endif
+
 #endif

--- a/lib/gpu/lj96_cut_gpu_kernel.cu
+++ b/lib/gpu/lj96_cut_gpu_kernel.cu
@ -84,7 +84,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
@ -234,8 +234,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv, 
                               const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               const int t_per_atom) {
+                               const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
--- a/lib/gpu/lj96_cut_gpu_memory.cpp
+++ b/lib/gpu/lj96_cut_gpu_memory.cpp
@ -130,7 +130,6 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
-  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
@ -141,15 +140,14 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch, 
-                          &this->_threads_per_atom);
+                          &ainum, &nbor_pitch, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->_threads_per_atom);
+                     &nbor_pitch, &this->_threads_per_atom);
  }
  this->time_pair.stop();
 }
--- a/lib/gpu/lj_cut_gpu_kernel.cu
+++ b/lib/gpu/lj_cut_gpu_kernel.cu
@ -84,7 +84,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
@ -233,8 +233,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global int *dev_nbor, __global int *dev_packed, 
                               __global acctyp4 *ans, __global acctyp *engv, 
                               const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               const int t_per_atom) {
+                               const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
--- a/lib/gpu/lj_cut_gpu_memory.cpp
+++ b/lib/gpu/lj_cut_gpu_memory.cpp
@ -130,7 +130,6 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
-  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
@ -141,15 +140,14 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch, 
-                          &this->_threads_per_atom);
+                          &ainum, &nbor_pitch, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->_threads_per_atom);
+                     &nbor_pitch, &this->_threads_per_atom);
  }
  this->time_pair.stop();
 }
--- a/lib/gpu/lj_expand_gpu_kernel.cu
+++ b/lib/gpu/lj_expand_gpu_kernel.cu
@ -84,7 +84,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
@ -236,8 +236,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv, 
                               const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               const int t_per_atom) {
+                               const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
--- a/lib/gpu/lj_expand_gpu_memory.cpp
+++ b/lib/gpu/lj_expand_gpu_memory.cpp
@ -130,7 +130,6 @@ void LJE_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
-  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
@ -141,15 +140,14 @@ void LJE_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch, 
-                          &this->_threads_per_atom);
+                          &ainum, &nbor_pitch, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->_threads_per_atom);
+                     &nbor_pitch, &this->_threads_per_atom);
  }
  this->time_pair.stop();
 }
--- a/lib/gpu/ljc_cut_gpu_kernel.cu
+++ b/lib/gpu/ljc_cut_gpu_kernel.cu
@ -94,7 +94,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                          const int nbor_pitch, __global numtyp *q_ ,
                          __global numtyp *cutsq, const numtyp qqrd2e,
                          const int t_per_atom) {
@ -270,9 +270,9 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv, 
                               const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               __global numtyp *q_ , __global numtyp *_cutsq,
-                               const numtyp qqrd2e, const int t_per_atom) {
+                               const int nbor_pitch, __global numtyp *q_,
+                               __global numtyp *_cutsq, const numtyp qqrd2e,
+                               const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
--- a/lib/gpu/ljc_cut_gpu_memory.cpp
+++ b/lib/gpu/ljc_cut_gpu_memory.cpp
@ -142,7 +142,6 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
-  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
@ -153,7 +152,7 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch,
+                          &ainum, &nbor_pitch,
                          &this->atom->dev_q.begin(), &cutsq.begin(),
                          &_qqrd2e, &this->_threads_per_atom);
  } else {
@ -162,7 +161,7 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
+                     &nbor_pitch, &this->atom->dev_q.begin(),
                     &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom);
  }
  this->time_pair.stop();
--- a/lib/gpu/ljcl_cut_gpu_kernel.cu
+++ b/lib/gpu/ljcl_cut_gpu_kernel.cu
@ -102,7 +102,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag, 
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                          const int nbor_pitch, __global numtyp *q_,
                          const numtyp cut_coulsq, const numtyp qqrd2e,
                          const numtyp g_ewald, const int t_per_atom) {
@ -286,10 +286,9 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv, 
                               const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               __global numtyp *q_ , const numtyp cut_coulsq,
-                               const numtyp qqrd2e, const numtyp g_ewald,
-                               const int t_per_atom) {
+                               const int nbor_pitch, __global numtyp *q_,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
--- a/lib/gpu/ljcl_cut_gpu_memory.cpp
+++ b/lib/gpu/ljcl_cut_gpu_memory.cpp
@ -140,7 +140,6 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
-  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
@ -151,18 +150,17 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch,
-                          &this->atom->dev_q.begin(), &_cut_coulsq,
-                          &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
+                          &ainum, &nbor_pitch, &this->atom->dev_q.begin(),
+                          &_cut_coulsq, &_qqrd2e, &_g_ewald,
+                          &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->atom->dev_q.begin(),
-                     &_cut_coulsq, &_qqrd2e, &_g_ewald, 
-                     &this->_threads_per_atom);
+                     &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq,
+                     &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
  }
  this->time_pair.stop();
 }
--- a/lib/gpu/morse_gpu_kernel.cu
+++ b/lib/gpu/morse_gpu_kernel.cu
@ -84,7 +84,7 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
-                          const int vflag, const int inum, const int nall,
+                          const int vflag, const int inum,
                          const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
@ -234,8 +234,7 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv, 
                               const int eflag, const int vflag, const int inum, 
-                               const int nall, const int nbor_pitch,
-                               const int t_per_atom) {
+                               const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
--- a/lib/gpu/morse_gpu_memory.cpp
+++ b/lib/gpu/morse_gpu_memory.cpp
@ -129,7 +129,6 @@ void MOR_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                               (BX/this->_threads_per_atom)));

  int ainum=this->ans->inum();
-  int anall=this->atom->nall();
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
@ -140,15 +139,14 @@ void MOR_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
                          &this->_nbor_data->begin(),
                          &this->ans->dev_ans.begin(),
                          &this->ans->dev_engv.begin(), &eflag, &vflag,
-                          &ainum, &anall, &nbor_pitch, 
-                          &this->_threads_per_atom);
+                          &ainum, &nbor_pitch, &this->_threads_per_atom);
  } else {
    this->k_pair.set_size(GX,BX);
    this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(),
                     &_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
-                     &anall, &nbor_pitch, &this->_threads_per_atom);
+                     &nbor_pitch, &this->_threads_per_atom);
  }
  this->time_pair.stop();
 }
--- a/lib/gpu/pair_gpu_build_kernel.cu
+++ b/lib/gpu/pair_gpu_build_kernel.cu
@ -253,7 +253,7 @@ __kernel void kernel_special(__global int *dev_nbor,
                             __global int *host_nbor_list, 
                             __global int *host_numj, __global int *tag,
                             __global int *nspecial, __global int *special,
-                             int inum, int nt, int nall, int max_nbors) {
+                             int inum, int nt, int max_nbors) {
  // ii indexes the two interacting particles in gi
  int ii=GLOBAL_ID_X;

--- a/lib/gpu/pair_gpu_nbor.cpp
+++ b/lib/gpu/pair_gpu_nbor.cpp
@ -389,7 +389,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum,
    _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), 
                           &dev_host_numj.begin(), &atom.dev_tag.begin(), 
                           &dev_nspecial.begin(), &dev_special.begin(), 
-                           &inum, &nt, &nall, &_max_nbors);
+                           &inum, &nt, &_max_nbors);
  }
  time_kernel.stop();

--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@ -49,7 +49,7 @@ __inline float fetch_q(const int& i, const float *q)
 // Allow PPPM to compile without atomics for NVIDIA 1.0 cards, error
 // generated at runtime with use of pppm/gpu
 #if (__CUDA_ARCH__ < 110)
-#define atom_add(x,y) 0
+#define atomicAdd(x,y) (*(x)+=0) /* placeholder that adds 0; parenthesized so the expansion stays one operand inside larger expressions */
 #endif

 #else