Changes from Mike Brown.

git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@5277 f3b2605a-c512-4ea7-a41b-209d697bcdaa
pscrozi 2010-11-23 00:40:35 +00:00
parent ae536ce7d0
commit 5a82c99485
130 changed files with 24967 additions and 4802 deletions

@@ -1,72 +0,0 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
BIN_DIR = .
OBJ_DIR = .
AR = ar
CUDA_CPP = /cygdrive/c/CUDA/bin/nvcc -I/cygdrive/c/CUDA/include -O3 -DWINDLL -DUNIX -Xptxas -v --use_fast_math
CUDA_ARCH = -arch=sm_13
CUDA_PREC = -D_SINGLE_SINGLE
CUDA_LINK = -L/cygdrive/c/CUDA/lib -lcudart $(CUDA_LIB)
CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)
CUDA_LIB = $(OBJ_DIR)/gpu.dll
# Headers for CUDA Stuff
NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h
# Dependencies for the Texture Tar
TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \
lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \
gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu
ALL_H = $(NVC_H) $(PAIR_H)
EXECS = $(BIN_DIR)/nvc_get_devices
OBJS = $(OBJ_DIR)/nvc_device.obj $(OBJ_DIR)/pair_gpu_nbor.obj \
$(OBJ_DIR)/pair_tex_tar.obj $(OBJ_DIR)/pair_gpu_cell.obj
all: $(CUDA_LIB) $(EXECS)
$(OBJ_DIR)/nvc_device.obj : nvc_device.cu $(NVC_H)
$(CUDA) -o $@ -c nvc_device.cu
$(OBJ_DIR)/pair_gpu_nbor.obj: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H)
$(CUDA) -o $@ -c pair_gpu_nbor.cu
$(OBJ_DIR)/pair_tex_tar.obj: $(TAR_H)
$(CUDA) -o $@ -c pair_tex_tar.cu
$(OBJ_DIR)/pair_gpu_cell.obj: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h
$(CUDA) -o $@ -c pair_gpu_cell.cu
$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.obj
$(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.obj
$(CUDA_LIB): $(OBJS) $(TAR_H)
$(CUDA) -o $@ -shared $(OBJS)
clean:
rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.exe *.exp *.lib *.dll *.linkinfo
veryclean: clean
rm -rf *~ *.linkinfo

lib/gpu/Makefile.fermi Normal file
@@ -0,0 +1,39 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA_HOME = $(HOME)/cuda
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_DOUBLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include
CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON
BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Nvidia.makefile

lib/gpu/Makefile.lens Normal file
@@ -0,0 +1,39 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA_HOME = /sw/analysis-x64/cuda/3.0/sl5.0_binary/
NVCC = nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON
CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias
BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Nvidia.makefile

lib/gpu/Makefile.lincoln Normal file
@@ -0,0 +1,36 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Makefile for NCSA's lincoln GPU cluster. Tested with "soft +cuda-2.3"
# ------------------------------------------------------------------------- */
CUDA_HOME = /usr/local/cuda-2.3
NVCC = $(CUDA_HOME)/bin/nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON
CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops
BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
include Nvidia.makefile

lib/gpu/Makefile.linux Normal file
@@ -0,0 +1,39 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA_HOME = /usr/local/cuda
NVCC = nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib64
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Nvidia.makefile

@@ -0,0 +1,31 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
OCL_LINK = -lOpenCL
OCL_PREC = -D_SINGLE_SINGLE
BIN_DIR = ./
OBJ_DIR = ./ocl_obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Opencl.makefile

lib/gpu/Makefile.longhorn Normal file
@@ -0,0 +1,35 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Makefile for the TACC longhorn cluster. Use "module load cuda".
# ------------------------------------------------------------------------- */
CUDA_HOME = $(TACC_CUDA_DIR)
NVCC = nvcc
CUDA_ARCH = -arch=sm_13
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB)
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math
CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK
CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias
BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
include Nvidia.makefile

lib/gpu/Makefile.mac Normal file
@@ -0,0 +1,39 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA_HOME = /usr/local/cuda
NVCC = nvcc
CUDA_ARCH = -arch=sm_11
CUDA_PRECISION = -D_SINGLE_SINGLE
CUDA_INCLUDE = -I$(CUDA_HOME)/include
CUDA_LIB = -L$(CUDA_HOME)/lib
CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32
CUDR_CPP = mpic++
CUDR_OPTS = -O2 -m32 -g
BIN_DIR = ./
OBJ_DIR = ./obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Nvidia.makefile

@@ -0,0 +1,31 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON
OCL_LINK = -framework OpenCL
OCL_PREC = -D_SINGLE_SINGLE
BIN_DIR = ./
OBJ_DIR = ./ocl_obj
LIB_DIR = ./
AR = ar
BSH = /bin/sh
include Opencl.makefile

@@ -1,72 +0,0 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
BIN_DIR = .
OBJ_DIR = .
AR = ar
CUDA_CPP = nvcc -I/usr/local/cuda/include -DUNIX -O3 -Xptxas -v --use_fast_math
CUDA_ARCH = -arch=sm_13
CUDA_PREC = -D_SINGLE_SINGLE
CUDA_LINK = -L/usr/local/cuda/lib -lcudart $(CUDA_LIB)
CUDA = $(CUDA_CPP) $(CUDA_ARCH) $(CUDA_PREC)
CUDA_LIB = $(OBJ_DIR)/libgpu.a
# Headers for CUDA Stuff
NVC_H = nvc_macros.h nvc_device.h nvc_timer.h nvc_memory.h nvc_traits.h
# Headers for Pair Stuff
PAIR_H = pair_gpu_texture.h pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_cell.h
# Dependencies for the Texture Tar
TAR_H = $(NVC_H) $(PAIR_H) pair_gpu_atom.cu lj_gpu_memory.h lj_gpu_memory.cu \
lj_gpu_kernel.h lj_gpu.cu gb_gpu_memory.h gb_gpu_memory.cu \
gb_gpu_extra.h gb_gpu_kernel.h gb_gpu.cu
ALL_H = $(NVC_H) $(PAIR_H)
EXECS = $(BIN_DIR)/nvc_get_devices
OBJS = $(OBJ_DIR)/nvc_device.o $(OBJ_DIR)/pair_gpu_nbor.cu_o \
$(OBJ_DIR)/pair_tex_tar.cu_o $(OBJ_DIR)/pair_gpu_cell.cu_o
all: $(CUDA_LIB) $(EXECS)
$(OBJ_DIR)/nvc_device.o: nvc_device.cu $(NVC_H)
$(CUDA) -o $@ -c nvc_device.cu
$(OBJ_DIR)/pair_gpu_nbor.cu_o: pair_gpu_nbor.cu pair_gpu_texture.h pair_gpu_nbor.h $(NVC_H)
$(CUDA) -o $@ -c pair_gpu_nbor.cu
$(OBJ_DIR)/pair_tex_tar.cu_o: $(TAR_H)
$(CUDA) -o $@ -c pair_tex_tar.cu
$(OBJ_DIR)/pair_gpu_cell.cu_o: pair_gpu_cell.cu pair_gpu_cell.h lj_gpu_memory.h
$(CUDA) -o $@ -c pair_gpu_cell.cu
$(BIN_DIR)/nvc_get_devices: nvc_get_devices.cu $(NVC_H) $(OBJ_DIR)/nvc_device.o
$(CUDA) -o $@ nvc_get_devices.cu $(CUDALNK) $(OBJ_DIR)/nvc_device.o
$(CUDA_LIB): $(OBJS)
$(AR) -crusv $(CUDA_LIB) $(OBJS)
clean:
rm -rf $(EXECS) $(CUDA_LIB) $(OBJS) *.linkinfo
veryclean: clean
rm -rf *~ *.linkinfo

lib/gpu/Nvidia.makefile Normal file
@@ -0,0 +1,218 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
CUDA = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \
$(CUDA_PRECISION)
CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
-Icudpp_mini
CUDA_LINK = $(CUDA_LIB) -lcudart
GPU_LIB = $(LIB_DIR)/libgpu.a
# Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h)
NVC_H = $(wildcard ./geryon/nvc*.h) $(UCL_H)
NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H)
# Headers for Pair Stuff
PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
pair_gpu_device.h pair_gpu_balance.h
ALL_H = $(NVD_H) $(PAIR_H)
EXECS = $(BIN_DIR)/nvc_get_devices
CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
$(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
$(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
$(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
$(OBJ_DIR)/charge_gpu_memory.o \
$(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
$(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
$(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
$(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
$(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \
$(CUDPP)
PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \
$(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \
$(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \
$(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \
$(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \
$(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \
$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \
$(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \
$(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \
$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \
$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
all: $(GPU_LIB) $(EXECS)
$(OBJ_DIR)/cudpp.o: cudpp_mini/cudpp.cpp
$(CUDR) -o $@ -c cudpp_mini/cudpp.cpp -Icudpp_mini
$(OBJ_DIR)/cudpp_plan.o: cudpp_mini/cudpp_plan.cpp
$(CUDR) -o $@ -c cudpp_mini/cudpp_plan.cpp -Icudpp_mini
$(OBJ_DIR)/cudpp_maximal_launch.o: cudpp_mini/cudpp_maximal_launch.cpp
$(CUDR) -o $@ -c cudpp_mini/cudpp_maximal_launch.cpp -Icudpp_mini
$(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini
$(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu
$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
$(OBJ_DIR)/pair_gpu_atom_kernel.ptx: pair_gpu_atom_kernel.cu
$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_atom_kernel.cu
$(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h
$(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h
$(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu
$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu
$(OBJ_DIR)/pair_gpu_nbor_ptx.h: $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h
$(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu
$(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_build_kernel.cu
$(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H)
$(CUDR) -o $@ -c pair_gpu_device.cpp
$(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
$(CUDR) -o $@ -c atomic_gpu_memory.cpp
$(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp
$(CUDR) -o $@ -c charge_gpu_memory.cpp
$(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h
$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu
$(OBJ_DIR)/gb_gpu_kernel_lj.ptx: gb_gpu_kernel_lj.cu pair_gpu_precision.h gb_gpu_extra.h
$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_lj.cu
$(OBJ_DIR)/gb_gpu_kernel_nbor.ptx: gb_gpu_kernel_nbor.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel_nbor.cu
$(OBJ_DIR)/gb_gpu_ptx.h: $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h
$(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_ptx.h
$(CUDR) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
$(CUDR) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_cut_gpu_kernel.ptx: lj_cut_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lj_cut_gpu_kernel.cu
$(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h
$(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
$(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ ljc_cut_gpu_kernel.cu
$(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h
$(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
$(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
$(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ ljcl_cut_gpu_kernel.cu
$(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h
$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o
$(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
$(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ lj96_cut_gpu_kernel.cu
$(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h
$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
$(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu
$(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h
$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
$(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h
$(CUDA) --ptx -DNV_KERNEL -o $@ cmmc_long_gpu_kernel.cu
$(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h
$(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o
$(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
$(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H)
$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDART $(CUDA_LINK)
$(GPU_LIB): $(OBJS)
$(AR) -crusv $(GPU_LIB) $(OBJS)
clean:
rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(PTXS) *.linkinfo
veryclean: clean
rm -rf *~ *.linkinfo

lib/gpu/Opencl.makefile Normal file
@@ -0,0 +1,155 @@
# /* ----------------------------------------------------------------------
# LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
# http://lammps.sandia.gov, Sandia National Laboratories
# Steve Plimpton, sjplimp@sandia.gov
#
# Copyright (2003) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
# certain rights in this software. This software is distributed under
# the GNU General Public License.
#
# See the README file in the top-level LAMMPS directory.
# ------------------------------------------------------------------------- */
#
# /* ----------------------------------------------------------------------
# Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
# Peng Wang (Nvidia), penwang@nvidia.com
# Paul Crozier (SNL), pscrozi@sandia.gov
# ------------------------------------------------------------------------- */
OCL = $(OCL_CPP) $(OCL_PREC) -DUSE_OPENCL
OCL_LIB = $(LIB_DIR)/libgpu.a
# Headers for Geryon
UCL_H = $(wildcard ./geryon/ucl*.h)
OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H)
# Headers for Pair Stuff
PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \
pair_gpu_device.h pair_gpu_balance.h
ALL_H = $(OCL_H) $(PAIR_H)
EXECS = $(BIN_DIR)/ocl_get_devices
OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \
$(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \
$(OBJ_DIR)/charge_gpu_memory.o \
$(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \
$(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \
$(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \
$(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \
$(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \
$(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \
$(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o
KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \
$(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \
$(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \
$(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \
$(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h
OCL_EXECS = $(BIN_DIR)/ocl_get_devices
all: $(OCL_LIB) $(EXECS)
$(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh pair_gpu_atom_kernel.cu $(OBJ_DIR)/pair_gpu_atom_cl.h
$(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) $(OBJ_DIR)/pair_gpu_atom_cl.h
$(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h
$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h
$(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H)
$(OCL) -o $@ -c pair_gpu_device.cpp
$(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp
$(OCL) -o $@ -c atomic_gpu_memory.cpp
$(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp
$(OCL) -o $@ -c charge_gpu_memory.cpp
$(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu
$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h
$(OBJ_DIR)/gb_gpu_cl.h: gb_gpu_kernel.cu gb_gpu_kernel_lj.cu gb_gpu_extra.h
cat gb_gpu_extra.h gb_gpu_kernel.cu > $(OBJ_DIR)/gb_gpu_kernel.tar; \
cat gb_gpu_extra.h gb_gpu_kernel_lj.cu > $(OBJ_DIR)/gb_gpu_kernel_lj.tar; \
$(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar $(OBJ_DIR)/gb_gpu_cl.h; \
rm -f $(OBJ_DIR)/gb_gpu_kernel.tar $(OBJ_DIR)/gb_gpu_kernel_lj.tar
$(OBJ_DIR)/gb_gpu_memory.o: $(ALL_H) gb_gpu_memory.h gb_gpu_memory.cpp $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h
$(OCL) -o $@ -c gb_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/gb_gpu.o: $(ALL_H) gb_gpu_memory.h gb_gpu.cpp
$(OCL) -o $@ -c gb_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh lj_cut_gpu_kernel.cu $(OBJ_DIR)/lj_cut_gpu_cl.h;
$(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp
$(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh ljc_cut_gpu_kernel.cu $(OBJ_DIR)/ljc_cut_gpu_cl.h;
$(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/charge_gpu_memory.o
$(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp
$(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh ljcl_cut_gpu_kernel.cu $(OBJ_DIR)/ljcl_cut_gpu_cl.h;
$(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/charge_gpu_memory.o
$(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp
$(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_cl.h;
$(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp
$(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h;
$(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp
$(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu
$(BSH) ./geryon/file_to_cstr.sh cmmc_long_gpu_kernel.cu $(OBJ_DIR)/cmmc_long_gpu_cl.h;
$(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/atomic_gpu_memory.o
$(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR)
$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp
$(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR)
$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)
$(OCL_LIB): $(OBJS) $(PTXS)
$(AR) -crusv $(OCL_LIB) $(OBJS)
opencl: $(OCL_EXECS)
clean:
rm -rf $(EXECS) $(OCL_EXECS) $(OCL_LIB) $(OBJS) $(KERS) *.linkinfo
veryclean: clean
rm -rf *~ *.linkinfo

@@ -12,7 +12,7 @@
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
@@ -20,57 +20,91 @@
GENERAL NOTES
This library, libgpu.a, provides routines for GPU acceleration
of LAMMPS pair styles. Currently, only CUDA enabled GPUs are
supported. Compilation of this library requires installing the CUDA
GPU driver and CUDA toolkit for your operating system. In addition to
the LAMMPS library, the binary nvc_get_devices will also be
built. This can be used to query the names and properties of GPU
devices on your system.
of LAMMPS pair styles. Compilation of this library requires
installing the CUDA GPU driver and CUDA toolkit for your operating
system. In addition to the LAMMPS library, the binary nvc_get_devices
will also be built. This can be used to query the names and
properties of GPU devices on your system. A Makefile for OpenCL
compilation is provided, but OpenCL use is not currently supported
by the developers.
NOTE: Installation of the CUDA SDK is not required.
Current pair styles supporting GPU acceleration:
1. lj/cut/gpu
2. gayberne/gpu
2. lj/cut/coul/cut/gpu
3. lj/cut/coul/long/gpu
4. lj96/cut/gpu
5. gayberne/gpu
6. cmm/cg/gpu
7. cmm/cg/coul/long/gpu
MULTIPLE LAMMPS PROCESSES
When using GPU acceleration, you are restricted to one physical GPU
per LAMMPS process. This can be multiple GPUs on a single node or
across multiple nodes. Intructions on GPU assignment can be found in
the LAMMPS documentation.
SPEEDUPS
The speedups that can be obtained using this library are highly
dependent on the GPU architecture and the computational expense of the
pair potential. When comparing a single precision Tesla C1060 run to a
serial Intel Xeon 5140 2.33 GHz run, the speedup is ~4.42x for
lj/cut with a cutoff of 2.5. For gayberne with a cutoff of 7, the
speedup is >103x for 8000 particles. The speedup will improve with an
increase in the number of particles or an increase in the cutoff.
Multiple LAMMPS MPI processes can share GPUs on the system, but multiple
GPUs cannot be utilized by a single MPI process. In many cases, the
best performance will be obtained by running as many MPI processes as
there are CPU cores available, with the condition that the number of MPI
processes is an integer multiple of the number of GPUs being used. See the
LAMMPS user manual for details on running with GPU acceleration.
BUILDING AND PRECISION MODES
To build, edit the CUDA_CPP, CUDA_ARCH, CUDA_PREC, and CUDA_LINK variables for
your machine. Type make. Additionally, the GPU package must be installed and
compiled for LAMMPS. The library supports 3 precision modes as determined by
the CUDA_PREC variable:
To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME, NVCC, CUDA_INCLUDE,
CUDA_LIB and CUDA_OPTS variables in one of the Makefiles. CUDA_ARCH should
be set based on the compute capability of your GPU. This can be verified by
running the nvc_get_devices executable after the build is complete.
Additionally, the GPU package must be installed and compiled for LAMMPS.
This may require editing the gpu_SYSPATH variable in the LAMMPS makefile.
Please note that the GPU library accesses the CUDA driver library directly,
so it needs to be linked not only to the CUDA runtime library (libcudart.so)
that ships with the CUDA toolkit, but also to the CUDA driver library
(libcuda.so) that ships with the Nvidia driver. If you are compiling LAMMPS
on the head node of a GPU cluster, this library may not be installed,
so you may need to copy it over from one of the compute nodes (best into
this directory).
The gpu library supports 3 precision modes as determined by
the CUDA_PRECISION variable:
CUDA_PRECISION = -D_SINGLE_SINGLE # Single precision for all calculations
CUDA_PRECISION = -D_DOUBLE_DOUBLE # Double precision for all calculations
CUDA_PRECISION = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double
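For orientation, these defines select the coordinate/force type and the
accumulator type that the library sources below refer to as PRECISION and
ACC_PRECISION (see pair_gpu_precision.h, listed in PAIR_H). A minimal sketch
of one plausible mapping is shown here; the typedef form and exact contents
of that header are assumptions, since it is not part of this listing:

  /* sketch only: the real definitions live in pair_gpu_precision.h */
  #ifdef _SINGLE_SINGLE
  typedef float  PRECISION;      /* positions, forces          */
  typedef float  ACC_PRECISION;  /* energy/virial accumulation */
  #elif defined(_SINGLE_DOUBLE)
  typedef float  PRECISION;
  typedef double ACC_PRECISION;
  #elif defined(_DOUBLE_DOUBLE)
  typedef double PRECISION;
  typedef double ACC_PRECISION;
  #endif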
NOTE: For the lj/cut pair style, only single precision will be used, even
if double precision is specified.
NOTE: Double precision is only supported on certain GPUS (with
NOTE: Double precision is only supported on certain GPUs (with
compute capability>=1.3).
NOTE: For Tesla and other graphics cards with compute capability>=1.3,
make sure that -arch=sm_13 is set on the CUDA_ARCH line.
NOTE: For Fermi, make sure that -arch=sm_20 is set on the CUDA_ARCH line.
NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
package has been installed before installing the GPU package in LAMMPS.
NOTE: The cg/cmm/gpu and cg/cmm/coul/long/gpu pair styles will only be
installed if the USER-CG-CMM package has been installed before
installing the GPU package in LAMMPS.
NOTE: The lj/cut/coul/long/gpu and cg/cmm/coul/long/gpu styles will only be
installed if the KSPACE package has been installed before installing
the GPU package in LAMMPS.
EXAMPLE BUILD PROCESS
cd ~/lammps/lib/gpu
emacs Makefile.linux
make -f Makefile.linux
./nvc_get_devices
cd ../../src
emacs ./MAKE/Makefile.linux
make yes-asphere
make yes-kspace
make yes-gpu
make linux
------------------------------------------------------------------------
Last merge with gpulammps: r561 on 2010-11-12
------------------------------------------------------------------------

@@ -0,0 +1,262 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "atomic_gpu_memory.h"
#define AtomicGPUMemoryT AtomicGPUMemory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
}
template <class numtyp, class acctyp>
AtomicGPUMemoryT::~AtomicGPUMemory() {
}
template <class numtyp, class acctyp>
int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
nbor_time_avail=false;
screen=_screen;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(false,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
_gpu_host,max_nbors,cell_size,false))
return false;
ucl_device=device->gpu;
atom=&device->atom;
nbor=&device->nbor;
_block_size=BLOCK_1D;
if (static_cast<size_t>(_block_size)>ucl_device->group_size())
_block_size=ucl_device->group_size();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
return true;
}
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
if (_compiled) {
k_pair_fast.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(inum,numj);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void AtomicGPUMemoryT::build_nbor_list(const int inum,
const int host_inum,
const int nall, double **host_x,
int *host_type, double *boxlo,
double *boxhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
nspecial, special, success, mn);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::compute(const int timestep, const int f_ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success) {
acc_timers();
if (inum_full==0) {
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
nbor->gpu_nbor());
atom->inum(inum);
host_start=inum;
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * AtomicGPUMemoryT::compute(const int timestep, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success) {
acc_timers();
if (inum_full==0) {
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
atom->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
boxlo, boxhi, tag, nspecial, special, success);
if (!success)
return NULL;
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom);
hd_balancer.stop_timer();
return device->nbor.host_nbor.begin();
}
template <class numtyp, class acctyp>
double AtomicGPUMemoryT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(AtomicGPUMemory<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void AtomicGPUMemoryT::compile_kernels(UCL_Device &dev, const char *pair_str) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
pos_tex.get_texture(*pair_program,"pos_tex");
_compiled=true;
}
template class AtomicGPUMemory<PRECISION,ACC_PRECISION>;

lib/gpu/atomic_gpu_memory.h Normal file
@@ -0,0 +1,180 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef ATOMIC_GPU_MEMORY_H
#define ATOMIC_GPU_MEMORY_H
#define BLOCK_1D 64
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
template <class numtyp, class acctyp>
class AtomicGPUMemory {
public:
AtomicGPUMemory();
virtual ~AtomicGPUMemory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(inum, nall, success))
pos_tex.bind_float(atom->dev_x,4);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_pair.add_to_total();
atom->acc_timers();
}
/// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int timestep, const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success);
/// Pair loop with device neighboring
int * compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// True if we need to accumulate time for neighboring
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
protected:
bool _compiled;
int _block_size;
double _max_bytes, _max_an_bytes;
void compile_kernels(UCL_Device &dev, const char *pair_string);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
#endif
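The lj_cut, lj96_cut, ljc_cut, ljcl_cut, cmm_cut, and cmmc_long "memory" classes
added in this commit depend on atomic_gpu_memory.o or charge_gpu_memory.o in the
Makefiles above and appear to build on this class or the analogous
ChargeGPUMemory class that follows. As a rough illustrative sketch only (the
class name, kernel-string name, and empty loop body below are hypothetical, not
code from this commit), a derived pair style passes its kernel source string to
init_atomic() and implements the pure virtual loop():

  #include "atomic_gpu_memory.h"

  // Hypothetical kernel source string, e.g. from a generated *_ptx.h or
  // *_cl.h header produced by geryon/file_to_cstr.sh.
  extern const char *example_pair_ptx;

  template <class numtyp, class acctyp>
  class ExamplePairGPU : public AtomicGPUMemory<numtyp,acctyp> {
   public:
    bool init(const int nlocal, const int nall, const int max_nbors,
              const int maxspecial, const double cell_size,
              const double gpu_split, FILE *screen) {
      // The base class sets up the device, atom/neighbor storage, the
      // host-device load balancer, and compiles the pair kernels.
      return this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
                               gpu_split,screen,example_pair_ptx);
    }
   protected:
    // Required by the base class: launch k_pair / k_pair_fast for one
    // force/energy evaluation using the current atom and neighbor data.
    void loop(const bool eflag, const bool vflag) { /* kernel launch here */ }
  };

The per-style wrapper files (lj_cut_gpu.cpp and friends) then presumably expose
plain functions that construct such an object and forward the LAMMPS pair style
calls to its init() and compute() members.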

@@ -0,0 +1,270 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include "charge_gpu_memory.h"
#define ChargeGPUMemoryT ChargeGPUMemory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) {
device=&pair_gpu_device;
}
template <class numtyp, class acctyp>
ChargeGPUMemoryT::~ChargeGPUMemory() {
}
template <class numtyp, class acctyp>
int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const {
return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const char *pair_program) {
nbor_time_avail=false;
screen=_screen;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(true,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor,
_gpu_host,max_nbors,cell_size,false))
return false;
ucl_device=device->gpu;
atom=&device->atom;
nbor=&device->nbor;
_block_size=BLOCK_1D;
if (static_cast<size_t>(_block_size)>ucl_device->group_size())
_block_size=ucl_device->group_size();
compile_kernels(*ucl_device,pair_program);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
_max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
return true;
}
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::clear_atomic() {
// Output any timing information
acc_timers();
double avg_split=hd_balancer.all_avg_split();
device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen);
if (_compiled) {
k_pair_fast.clear();
k_pair.clear();
delete pair_program;
_compiled=false;
}
time_pair.clear();
hd_balancer.clear();
device->clear();
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist,
int *numj, int **firstneigh, bool &success) {
success=true;
nbor_time_avail=true;
int mn=nbor->max_nbor_loop(inum,numj);
resize_atom(inum,nall,success);
resize_local(inum,mn,success);
if (!success)
return NULL;
nbor->get_host(inum,ilist,numj,firstneigh,block_size());
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return ilist;
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void ChargeGPUMemoryT::build_nbor_list(const int inum,
const int host_inum,
const int nall, double **host_x,
int *host_type, double *boxlo,
double *boxhi, int *tag,
int **nspecial, int **special,
bool &success) {
nbor_time_avail=true;
success=true;
resize_atom(inum,nall,success);
resize_local(inum,host_inum,nbor->max_nbors(),success);
if (!success)
return;
atom->cast_copy_x(host_x,host_type);
int mn;
nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag,
nspecial, special, success, mn);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::compute(const int timestep, const int f_ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q) {
acc_timers();
if (inum_full==0) {
zero_timers();
return;
}
int ago=hd_balancer.ago_first(f_ago);
int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time,
nbor->gpu_nbor());
atom->inum(inum);
host_start=inum;
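// Reneighbor step: copy the neighbor list built on the host to the device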
if (ago==0) {
reset_nbors(nall, inum, ilist, numj, firstneigh, success);
if (!success)
return;
}
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
atom->add_other_data();
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom,ilist);
hd_balancer.stop_timer();
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int * ChargeGPUMemoryT::compute(const int timestep, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success,
double *host_q) {
acc_timers();
if (inum_full==0) {
zero_timers();
return NULL;
}
hd_balancer.balance(cpu_time,nbor->gpu_nbor());
int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full);
atom->inum(inum);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
boxlo, boxhi, tag, nspecial, special, success);
if (!success)
return NULL;
atom->cast_q_data(host_q);
hd_balancer.start_timer();
} else {
atom->cast_x_data(host_x,host_type);
atom->cast_q_data(host_q);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);
}
atom->add_other_data();
loop(eflag,vflag);
atom->copy_answers(eflag,vflag,eatom,vatom);
hd_balancer.stop_timer();
return device->nbor.host_nbor.begin();
}
template <class numtyp, class acctyp>
double ChargeGPUMemoryT::host_memory_usage_atomic() const {
return device->atom.host_memory_usage()+
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(ChargeGPUMemory<numtyp,acctyp>);
}
template <class numtyp, class acctyp>
void ChargeGPUMemoryT::compile_kernels(UCL_Device &dev, const char *pair_str) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,flags.c_str());
k_pair_fast.set_function(*pair_program,"kernel_pair_fast");
k_pair.set_function(*pair_program,"kernel_pair");
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");
_compiled=true;
}
template class ChargeGPUMemory<PRECISION,ACC_PRECISION>;

183
lib/gpu/charge_gpu_memory.h Normal file

@ -0,0 +1,183 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CHARGE_GPU_MEMORY_H
#define CHARGE_GPU_MEMORY_H
#define BLOCK_1D 64
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
#ifdef USE_OPENCL
#include "geryon/ocl_texture.h"
#else
#include "geryon/nvd_texture.h"
#endif
template <class numtyp, class acctyp>
class ChargeGPUMemory {
public:
ChargeGPUMemory();
virtual ~ChargeGPUMemory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const char *pair_program);
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
if (atom->resize(inum, nall, success)) {
pos_tex.bind_float(atom->dev_x,4);
q_tex.bind_float(atom->dev_q,1);
}
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note olist_size=total number of local particles **/
inline void resize_local(const int inum, const int max_nbors, bool &success) {
nbor->resize(inum,max_nbors,success);
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to be copied to host
* \param max_nbors current maximum number of neighbors
* \note host_inum is 0 if the host is performing neighboring
* \note nlocal+host_inum=total number local particles
* \note olist_size=0 **/
inline void resize_local(const int inum, const int host_inum,
const int max_nbors, bool &success) {
nbor->resize(inum,host_inum,max_nbors,success);
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear_atomic();
/// Returns memory usage on device per atom
int bytes_per_atom_atomic(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage_atomic() const;
/// Accumulate timers
inline void acc_timers() {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_pair.add_to_total();
atom->acc_timers();
}
/// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_pair.zero();
atom->zero_timers();
}
/// Copy neighbor list from host
int * reset_nbors(const int nall, const int inum, int *ilist, int *numj,
int **firstneigh, bool &success);
/// Build neighbor list on device
void build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, bool &success);
/// Pair loop with host neighboring
void compute(const int timestep, const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success,
double *charge);
/// Pair loop with device neighboring
int * compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *boxlo,
double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *charge);
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Timers
UCL_Timer time_pair;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// True if we need to accumulate time for neighboring
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
inline int block_size() { return _block_size; }
// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
UCL_Texture q_tex;
protected:
bool _compiled;
int _block_size;
double _max_bytes, _max_an_bytes;
void compile_kernels(UCL_Device &dev, const char *pair_string);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
};
#endif
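For orientation, a minimal sketch (not part of this commit) of how a derived charge pair style is expected to reuse this class; the class name MyChargePair, the 300-row initial neighbor guess, and maxspecial=0 are illustrative assumptions only.
// Hypothetical sketch: a derived style supplies its kernel source and a
// loop() implementation, then reuses the generic plumbing in this class.
#include <cstdio>
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class MyChargePair : public ChargeGPUMemory<numtyp,acctyp> {
 public:
  bool init(const int nlocal, const int nall, const double cell_size,
            const double gpu_split, FILE *screen, const char *kernel_src) {
    // 300 is just an initial guess for rows in the neighbor matrix;
    // maxspecial=0 means no special-bond exclusions handled on the device.
    return this->init_atomic(nlocal,nall,300,0,cell_size,gpu_split,
                             screen,kernel_src);
  }
 private:
  void loop(const bool eflag, const bool vflag) {
    // A real style enqueues k_pair / k_pair_fast here with its own
    // coefficient arrays, as CMML_GPU_Memory::loop does later in this commit.
  }
};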

124
lib/gpu/cmm_cut_gpu.cpp Normal file

@ -0,0 +1,124 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "cmm_cut_gpu_memory.h"
using namespace std;
static CMM_GPU_Memory<PRECISION,ACC_PRECISION> CMMMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
CMMMF.clear();
gpu_mode=CMMMF.device->gpu_mode();
double gpu_split=CMMMF.device->particle_split();
int first_gpu=CMMMF.device->first_device();
int last_gpu=CMMMF.device->last_device();
int world_me=CMMMF.device->world_me();
int gpu_rank=CMMMF.device->gpu_rank();
int procs_per_gpu=CMMMF.device->procs_per_gpu();
CMMMF.device->init_message(screen,"cg/cmm",first_gpu,last_gpu);
bool message=false;
if (world_me==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
if (world_me==0) {
bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
MPI_Barrier(MPI_COMM_WORLD);
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
MPI_Barrier(CMMMF.device->gpu_comm);
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
}
void cmm_gpu_clear() {
CMMMF.clear();
}
int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
return CMMMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success);
}
void cmm_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success) {
CMMMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double cmm_gpu_bytes() {
return CMMMF.host_memory_usage();
}
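A rough caller-side sketch, again an assumption for illustration: the owning pair style initializes once through cmm_gpu_init, then calls one of the two compute entry points each step depending on whether neighboring is done on the device or on the host.
// Hypothetical caller-side sketch (not part of this commit). All values come
// from the owning LAMMPS pair style; only the call order matters here.
void cmm_gpu_step(bool device_neighboring, int timestep, int ago,
                  int inum_full, int nall, double **x, int *type,
                  double *boxlo, double *boxhi, int *tag, int **nspecial,
                  int **special, int *ilist, int *numj, int **firstneigh,
                  bool eflag, bool vflag, double cpu_time) {
  int host_start;
  bool success;
  if (device_neighboring) {
    // Neighbor list is built on the GPU and returned for the host to reuse
    int *list = cmm_gpu_compute_n(timestep, ago, inum_full, nall, x, type,
                                  boxlo, boxhi, tag, nspecial, special,
                                  eflag, vflag, false, false, host_start,
                                  cpu_time, success);
    (void)list;
  } else {
    // Neighbor list built on the host is copied to the GPU as needed
    cmm_gpu_compute(timestep, ago, inum_full, nall, x, type, ilist, numj,
                    firstneigh, eflag, vflag, false, false, host_start,
                    cpu_time, success);
  }
}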


@ -0,0 +1,296 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMM_GPU_KERNEL
#define CMM_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes this thread's local atom in the neighbor list
int ii=GLOBAL_ID_X;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].x) {
r2inv=(numtyp)1.0/r2inv;
numtyp inv1,inv2;
if (lj1[mtype].y == 2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj1[mtype].y == 1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)-
lj3[mtype].z;
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in,__global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes this thread's local atom in the neighbor list
int ii=THREAD_ID_X;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (ii<4)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (eflag>0)
lj3[ii]=lj3_in[ii];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].x) {
r2inv=(numtyp)1.0/r2inv;
numtyp inv1,inv2;
if (lj1[mtype].y == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj1[mtype].y == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
numtyp force = factor_lj*r2inv*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0)
energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)-
lj3[mtype].z;
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif


@ -0,0 +1,150 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "cmm_cut_gpu_cl.h"
#else
#include "cmm_cut_gpu_ptx.h"
#endif
#include "cmm_cut_gpu_memory.h"
#include <cassert>
#define CMM_GPU_MemoryT CMM_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CMM_GPU_MemoryT::CMM_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
CMM_GPU_MemoryT::~CMM_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmm_cut_gpu_kernel);
// If atom type constants fit in shared memory use fast kernel
int cmm_types=ntypes;
shared_types=false;
if (cmm_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
cmm_types=MAX_SHARED_TYPES;
shared_types=true;
}
_cmm_types=cmm_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(cmm_types*cmm_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<cmm_types*cmm_types; i++)
host_write[i]=0.0;
lj1.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj1,host_write,host_cutsq,
host_cg_type,host_lj1,host_lj2);
lj3.alloc(cmm_types*cmm_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,cmm_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
}
template <class numtyp, class acctyp>
void CMM_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CMM_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CMM_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int ainum=this->atom->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
}
this->time_pair.stop();
}
template class CMM_GPU_Memory<PRECISION,ACC_PRECISION>;


@ -0,0 +1,71 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMM_GPU_MEMORY_H
#define CMM_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class CMM_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
CMM_GPU_Memory();
~CMM_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, int **host_cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y=cg_type, lj1.z = lj1, lj1.w = lj2
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _cmm_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif
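To make the packed coefficient layout documented above concrete, here is a plain CPU illustration under the assumption that coefficients are stored row-major per type pair; it is not the actual type_pack4 routine, and the struct name coeff4 is only a stand-in for the device numtyp4 type.
// Illustration only: pack per-type-pair tables into one vec4 entry per (i,j).
struct coeff4 { float x, y, z, w; };
void pack_lj1(int ntypes, int row_types, double **cutsq, int **cg_type,
              double **lj1, double **lj2, coeff4 *out) {
  for (int i = 0; i < ntypes; i++)
    for (int j = 0; j < ntypes; j++) {
      coeff4 &c = out[i * row_types + j];
      c.x = static_cast<float>(cutsq[i][j]);    // squared cutoff
      c.y = static_cast<float>(cg_type[i][j]);  // CG/CMM exponent flag
      c.z = static_cast<float>(lj1[i][j]);      // force prefactor 1
      c.w = static_cast<float>(lj2[i][j]);      // force prefactor 2
    }
}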

130
lib/gpu/cmmc_long_gpu.cpp Normal file

@ -0,0 +1,130 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "cmmc_long_gpu_memory.h"
using namespace std;
static CMML_GPU_Memory<PRECISION,ACC_PRECISION> CMMLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
CMMLMF.clear();
gpu_mode=CMMLMF.device->gpu_mode();
double gpu_split=CMMLMF.device->particle_split();
int first_gpu=CMMLMF.device->first_device();
int last_gpu=CMMLMF.device->last_device();
int world_me=CMMLMF.device->world_me();
int gpu_rank=CMMLMF.device->gpu_rank();
int procs_per_gpu=CMMLMF.device->procs_per_gpu();
CMMLMF.device->init_message(screen,"cg/cmm/coul/long",first_gpu,last_gpu);
bool message=false;
if (world_me==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
if (world_me==0) {
bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum,
nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e,g_ewald);
if (!init_ok)
return false;
}
MPI_Barrier(MPI_COMM_WORLD);
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, inum,
nall, 300, maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);
if (!init_ok)
return false;
}
MPI_Barrier(CMMLMF.device->gpu_comm);
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
}
void cmml_gpu_clear() {
CMMLMF.clear();
}
int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return CMMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
}
void cmml_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
CMMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
}
double cmml_gpu_bytes() {
return CMMLMF.host_memory_usage();
}


@ -0,0 +1,378 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMML_GPU_KERNEL
#define CMML_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#define EWALD_F (numtyp)1.12837917
#define EWALD_P (numtyp)0.3275911
#define A1 (numtyp)0.254829592
#define A2 (numtyp)-0.284496736
#define A3 (numtyp)1.421413741
#define A4 (numtyp)-1.453152027
#define A5 (numtyp)1.061405429
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
texture<float4> pos_tex;
texture<float> q_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
__inline double fetch_q(const int& i, const double *q)
{
return q[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
__inline float fetch_q(const int& i, const float *q)
{
return tex1Dfetch(q_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#define fetch_q(i,y) q_[i]
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes this thread's local atom in the neighbor list
int ii=GLOBAL_ID_X;
__local numtyp sp_lj[8];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
sp_lj[4]=sp_lj_in[4];
sp_lj[5]=sp_lj_in[5];
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp e_coul=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int itype=ix.w;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
numtyp factor_lj, factor_coul;
if (j < nall) {
factor_lj = (numtyp)1.0;
factor_coul = (numtyp)0.0;
} else {
factor_lj = sp_lj[j/nall];
factor_coul = (numtyp)1.0-sp_lj[j/nall+4];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else {
forcecoul = (numtyp)0.0;
prefactor = (numtyp)0.0;
}
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch,
__global numtyp *q_ , const numtyp cut_coulsq,
const numtyp qqrd2e, const numtyp g_ewald) {
// ii indexes this thread's local atom in the neighbor list
int ii=THREAD_ID_X;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
if (ii<8)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
lj3[ii]=lj3_in[ii];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp e_coul=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
numtyp qtmp=fetch_q(i,q_);
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
numtyp factor_lj, factor_coul;
if (j < nall) {
factor_lj = (numtyp)1.0;
factor_coul = (numtyp)0.0;
} else {
factor_lj = sp_lj[j/nall];
factor_coul = (numtyp)1.0-sp_lj[j/nall+4];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;
if (rsq<lj1[mtype].x) {
numtyp forcecoul, force_lj, force, inv1, inv2, prefactor, _erfc;
numtyp r2inv=(numtyp)1.0/rsq;
if (rsq < lj1[mtype].y) {
if (lj3[mtype].x == (numtyp)2) {
inv1=r2inv*r2inv;
inv2=inv1*inv1;
} else if (lj3[mtype].x == (numtyp)1) {
inv2=r2inv*sqrt(r2inv);
inv1=inv2*inv2;
} else {
inv1=r2inv*r2inv*r2inv;
inv2=inv1;
}
force_lj = factor_lj*inv1*(lj1[mtype].z*inv2-lj1[mtype].w);
} else
force_lj = (numtyp)0.0;
if (rsq < cut_coulsq) {
numtyp r = sqrt(rsq);
numtyp grij = g_ewald * r;
numtyp expm2 = exp(-grij*grij);
numtyp t = (numtyp)1.0 / ((numtyp)1.0 + EWALD_P*grij);
_erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
} else {
forcecoul = (numtyp)0.0;
prefactor = (numtyp)0.0;
}
force = (force_lj + forcecoul) * r2inv;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < lj1[mtype].y) {
energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
lj3[mtype].w;
}
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
*ap1=e_coul;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif
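Both kernels above evaluate erfc through the polynomial fit behind EWALD_P and A1..A5. A small host-side check, an assumption written for illustration with std::erfc as the reference, shows the same formula and how closely it tracks the library function.
// Host-side sketch (illustration only): the erfc approximation used in the
// kernels above, checked against std::erfc. Constants match the #defines.
#include <cmath>
#include <cstdio>
static double erfc_approx(double x) {
  const double EWALD_P = 0.3275911;
  const double A1 = 0.254829592, A2 = -0.284496736, A3 = 1.421413741;
  const double A4 = -1.453152027, A5 = 1.061405429;
  double t = 1.0 / (1.0 + EWALD_P * x);
  return t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * std::exp(-x * x);
}
int main() {
  for (double x = 0.1; x < 3.0; x += 0.5)
    printf("x=%.1f  approx=%.7f  erfc=%.7f\n", x, erfc_approx(x), std::erfc(x));
  return 0;
}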


@ -0,0 +1,164 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "cmmc_long_gpu_cl.h"
#else
#include "cmmc_long_gpu_ptx.h"
#endif
#include "cmmc_long_gpu_memory.h"
#include <cassert>
#define CMML_GPU_MemoryT CMML_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
CMML_GPU_MemoryT::CMML_GPU_Memory() : ChargeGPUMemory<numtyp,acctyp>(),
_allocated(false) {
}
template <class numtyp, class acctyp>
CMML_GPU_MemoryT::~CMML_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
int **host_cg_type, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen,
double **host_cut_ljsq,
const double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double g_ewald) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,cmmc_long_gpu_kernel);
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_cutsq,
host_cut_ljsq,host_lj1,host_lj2);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_cg_type,host_lj3,
host_lj4,host_offset);
sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);
_cut_coulsq=host_cut_coulsq;
_qqrd2e=qqrd2e;
_g_ewald=g_ewald;
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
}
template <class numtyp, class acctyp>
void CMML_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double CMML_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CMML_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int ainum=this->atom->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch,
&this->atom->dev_q.begin(), &_cut_coulsq,
&_qqrd2e, &_g_ewald);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch, &this->atom->dev_q.begin(),
&_cut_coulsq, &_qqrd2e, &_g_ewald);
}
this->time_pair.stop();
}
template class CMML_GPU_Memory<PRECISION,ACC_PRECISION>;


@ -0,0 +1,75 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef CMML_GPU_MEMORY_H
#define CMML_GPU_MEMORY_H
#include "charge_gpu_memory.h"
template <class numtyp, class acctyp>
class CMML_GPU_Memory : public ChargeGPUMemory<numtyp, acctyp> {
public:
CMML_GPU_Memory();
~CMML_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, int ** cg_type,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double **host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double g_ewald);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = cutsq, lj1.y = cutsq_vdw, lj1.z = lj1, lj1.w = lj2,
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = cg_type, lj3.y = lj3, lj3.z = lj4, lj3.w = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
numtyp _cut_coulsq, _qqrd2e, _g_ewald;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif


@ -0,0 +1,5 @@
This is a stripped-down and customized version
of the CUDPP (CUDA Data Parallel Primitives) library
for use with the GPU package in LAMMPS.
Don't use it for anything else; get the real thing
from http://code.google.com/p/cudpp/ instead!


@ -0,0 +1,337 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include <cudpp_globals.h>
#include "cudpp_radixsort.h"
#include "cta/scan_cta.cu"
#include <cudpp.h>
#include <stdio.h>
#include <cudpp_util.h>
#include <math.h>
#include "sharedmem.h"
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif
/**
* @file
* sort_cta.cu
*
* @brief CUDPP CTA-level sort routines
*/
/** \addtogroup cudpp_cta
* @{
*/
/** @name Radix Sort Functions
* @{
*/
typedef unsigned int uint;
/**
* @brief Flips bits of single-precision floating-point number (parameterized by doFlip)
*
* flip a float for sorting
* finds SIGN of fp number.
* if it's 1 (negative float), it flips all bits
* if it's 0 (positive float), it flips the sign only
* @param[in] f floating-point input (passed as unsigned int)
* @see floatUnflip
**/
template <bool doFlip>
__device__ uint floatFlip(uint f)
{
if (doFlip)
{
uint mask = -int(f >> 31) | 0x80000000;
return f ^ mask;
}
else
return f;
}
/**
* @brief Reverses bit-flip of single-precision floating-point number (parameterized by doFlip)
*
* flip a float back (invert FloatFlip)
* signed was flipped from above, so:
* if sign is 1 (negative), it flips the sign bit back
* if sign is 0 (positive), it flips all bits back
* @param[in] f floating-point input (passed as unsigned int)
* @see floatFlip
**/
template <bool doFlip>
__device__ uint floatUnflip(uint f)
{
if (doFlip)
{
uint mask = ((f >> 31) - 1) | 0x80000000;
return f ^ mask;
}
else
return f;
}
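A host-side sketch (an illustrative assumption, not library code) of why the flip works: after floatFlip, plain unsigned comparison orders IEEE-754 floats correctly, negatives included, and floatUnflip restores the original bits.
// Host-side sketch (illustration only) of the float flip used above.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
static uint32_t float_flip(uint32_t f) {
  uint32_t mask = -(int32_t)(f >> 31) | 0x80000000u;   // all bits if negative
  return f ^ mask;
}
static uint32_t float_unflip(uint32_t f) {
  uint32_t mask = ((f >> 31) - 1u) | 0x80000000u;      // inverse of the above
  return f ^ mask;
}
int main() {
  float vals[4] = {-2.5f, -0.5f, 0.25f, 3.0f};
  uint32_t keys[4];
  for (int i = 0; i < 4; i++) {
    memcpy(&keys[i], &vals[i], sizeof(float));
    keys[i] = float_flip(keys[i]);
  }
  std::sort(keys, keys + 4);                 // plain unsigned sort
  for (int i = 0; i < 4; i++) {
    uint32_t u = float_unflip(keys[i]);
    float f; memcpy(&f, &u, sizeof(float));
    printf("%g\n", f);                       // prints -2.5 -0.5 0.25 3
  }
  return 0;
}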
/**
* @brief Scans one warp quickly, optimized for 32-element warps, using shared memory
*
* Scans each warp in parallel ("warp-scan"), one element per thread.
* Uses 2 elements of shared memory per thread (64 elements per warp).
*
* @param[in] val This thread's input value
* @param[in,out] sData Shared memory workspace (2 * WARP_SIZE entries per warp)
**/
template<class T, int maxlevel>
__device__ T scanwarp(T val, volatile T* sData)
{
// The following is the same as 2 * WARP_SIZE * warpId + threadInWarp =
// 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE - 1))
int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE - 1));
sData[idx] = 0;
idx += WARP_SIZE;
T t = sData[idx] = val; __EMUSYNC;
#ifdef __DEVICE_EMULATION__
t = sData[idx - 1]; __EMUSYNC;
sData[idx] += t; __EMUSYNC;
t = sData[idx - 2]; __EMUSYNC;
sData[idx] += t; __EMUSYNC;
t = sData[idx - 4]; __EMUSYNC;
sData[idx] += t; __EMUSYNC;
t = sData[idx - 8]; __EMUSYNC;
sData[idx] += t; __EMUSYNC;
t = sData[idx - 16]; __EMUSYNC;
sData[idx] += t; __EMUSYNC;
#else
if (0 <= maxlevel) { sData[idx] = t = t + sData[idx - 1]; } __EMUSYNC;
if (1 <= maxlevel) { sData[idx] = t = t + sData[idx - 2]; } __EMUSYNC;
if (2 <= maxlevel) { sData[idx] = t = t + sData[idx - 4]; } __EMUSYNC;
if (3 <= maxlevel) { sData[idx] = t = t + sData[idx - 8]; } __EMUSYNC;
if (4 <= maxlevel) { sData[idx] = t = t + sData[idx -16]; } __EMUSYNC;
#endif
return sData[idx] - val; // convert inclusive -> exclusive
}
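For reference, the warp scan above returns an exclusive prefix sum over the WARP_SIZE values held by one warp; a serial sketch (illustration only, assuming a 32-wide warp and a sum scan) of the same result:
// CPU reference (illustration only) of the exclusive warp scan above.
static const int WARP_SIZE = 32;  // assumed warp width, as in the kernel
void exclusive_warp_scan(const unsigned int *in, unsigned int *out) {
  unsigned int running = 0;
  for (int i = 0; i < WARP_SIZE; i++) {
    out[i] = running;      // value before adding in[i]: exclusive result
    running += in[i];
  }
}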
/**
* @brief Scans 4*CTA_SIZE unsigned ints in a block
*
* scan4 scans 4*CTA_SIZE elements in a block (4 per
* thread), using a warp-scan algorithm
*
* @param[in] idata 4-vector of integers to scan
**/
__device__ uint4 scan4(uint4 idata)
{
extern __shared__ uint ptr[];
uint idx = threadIdx.x;
uint4 val4 = idata;
uint sum[3];
sum[0] = val4.x;
sum[1] = val4.y + sum[0];
sum[2] = val4.z + sum[1];
uint val = val4.w + sum[2];
val = scanwarp<uint, 4>(val, ptr);
__syncthreads();
if ((idx & (WARP_SIZE - 1)) == WARP_SIZE - 1)
{
ptr[idx >> 5] = val + val4.w + sum[2];
}
__syncthreads();
#ifndef __DEVICE_EMULATION__
if (idx < WARP_SIZE)
#endif
{
ptr[idx] = scanwarp<uint, 2>(ptr[idx], ptr);
}
__syncthreads();
val += ptr[idx >> 5];
val4.x = val;
val4.y = val + sum[0];
val4.z = val + sum[1];
val4.w = val + sum[2];
return val4;
}
/**
* @brief Computes output position for each thread given predicate; trues come first then falses
*
* Rank is the core of the radix sort loop. Given a predicate, it
* computes the output position for each thread in an ordering where all
* True threads come first, followed by all False threads.
* This version handles 4 predicates per thread; hence, "rank4".
*
* @param[in] preds true/false values for each of the 4 elements in this thread
*
* @todo is the description of "preds" correct?
**/
template <int ctasize>
__device__ uint4 rank4(uint4 preds)
{
uint4 address = scan4(preds);
__shared__ uint numtrue;
if (threadIdx.x == ctasize-1)
{
numtrue = address.w + preds.w;
}
__syncthreads();
uint4 rank;
uint idx = threadIdx.x << 2;
rank.x = (preds.x) ? address.x : numtrue + idx - address.x;
rank.y = (preds.y) ? address.y : numtrue + idx + 1 - address.y;
rank.z = (preds.z) ? address.z : numtrue + idx + 2 - address.z;
rank.w = (preds.w) ? address.w : numtrue + idx + 3 - address.w;
return rank;
}
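Written out serially (illustration only), the rank is a stable split: true elements are packed first in their original order, false elements follow, which is exactly what the per-bit radix pass below relies on.
// CPU reference (illustration only) of the rank computation above: stable
// split of elements by predicate, trues first, falses after.
#include <cstddef>
#include <vector>
std::vector<unsigned int> rank(const std::vector<bool> &pred) {
  unsigned int numtrue = 0;
  for (bool p : pred) if (p) numtrue++;
  std::vector<unsigned int> r(pred.size());
  unsigned int t = 0, f = 0;
  for (std::size_t i = 0; i < pred.size(); i++)
    r[i] = pred[i] ? t++ : numtrue + f++;
  return r;
}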
/**
* @brief Sorts one block
*
* Uses rank to sort one bit at a time: Sorts a block according
* to bits startbit -> nbits + startbit
* @param[in,out] key
* @param[in,out] value
**/
template<uint nbits, uint startbit>
__device__ void radixSortBlock(uint4 &key, uint4 &value)
{
extern __shared__ uint sMem1[];
for(uint shift = startbit; shift < (startbit + nbits); ++shift)
{
uint4 lsb;
lsb.x = !((key.x >> shift) & 0x1);
lsb.y = !((key.y >> shift) & 0x1);
lsb.z = !((key.z >> shift) & 0x1);
lsb.w = !((key.w >> shift) & 0x1);
uint4 r = rank4<256>(lsb);
#if 1
// This arithmetic strides the ranks across 4 SORT_CTA_SIZE regions
sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = key.x;
sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = key.y;
sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = key.z;
sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = key.w;
__syncthreads();
// The above allows us to read without 4-way bank conflicts:
key.x = sMem1[threadIdx.x];
key.y = sMem1[threadIdx.x + SORT_CTA_SIZE];
key.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE];
key.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE];
__syncthreads();
sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = value.x;
sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = value.y;
sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = value.z;
sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = value.w;
__syncthreads();
value.x = sMem1[threadIdx.x];
value.y = sMem1[threadIdx.x + SORT_CTA_SIZE];
value.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE];
value.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE];
#else
sMem1[r.x] = key.x;
sMem1[r.y] = key.y;
sMem1[r.z] = key.z;
sMem1[r.w] = key.w;
__syncthreads();
// This access has 4-way bank conflicts
key = sMem[threadIdx.x];
__syncthreads();
sMem1[r.x] = value.x;
sMem1[r.y] = value.y;
sMem1[r.z] = value.z;
sMem1[r.w] = value.w;
__syncthreads();
value = sMem[threadIdx.x];
#endif
__syncthreads();
}
}
/**
* @brief Sorts one block. Key-only version.
*
* Uses rank to sort one bit at a time: Sorts a block according
* to bits startbit -> nbits + startbit
* @param[in,out] key
**/
template<uint nbits, uint startbit>
__device__ void radixSortBlockKeysOnly(uint4 &key)
{
extern __shared__ uint sMem1[];
for(uint shift = startbit; shift < (startbit + nbits); ++shift)
{
uint4 lsb;
lsb.x = !((key.x >> shift) & 0x1);
lsb.y = !((key.y >> shift) & 0x1);
lsb.z = !((key.z >> shift) & 0x1);
lsb.w = !((key.w >> shift) & 0x1);
uint4 r = rank4<256>(lsb);
#if 1
// This arithmetic strides the ranks across 4 CTA_SIZE regions
sMem1[(r.x & 3) * SORT_CTA_SIZE + (r.x >> 2)] = key.x;
sMem1[(r.y & 3) * SORT_CTA_SIZE + (r.y >> 2)] = key.y;
sMem1[(r.z & 3) * SORT_CTA_SIZE + (r.z >> 2)] = key.z;
sMem1[(r.w & 3) * SORT_CTA_SIZE + (r.w >> 2)] = key.w;
__syncthreads();
// The above allows us to read without 4-way bank conflicts:
key.x = sMem1[threadIdx.x];
key.y = sMem1[threadIdx.x + SORT_CTA_SIZE];
key.z = sMem1[threadIdx.x + 2 * SORT_CTA_SIZE];
key.w = sMem1[threadIdx.x + 3 * SORT_CTA_SIZE];
#else
sMem1[r.x] = key.x;
sMem1[r.y] = key.y;
sMem1[r.z] = key.z;
sMem1[r.w] = key.w;
__syncthreads();
// This access has 4-way bank conflicts
key = sMem[threadIdx.x];
#endif
__syncthreads();
}
}
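A serial sketch of the same block-level idea (illustration only): process bits startbit..startbit+nbits-1 one at a time, stably splitting the keys so that zero bits come first, which yields an ascending order over the selected bit range.
// CPU reference (illustration only) of the per-bit block sort above.
#include <vector>
void radix_sort_bits(std::vector<unsigned int> &keys,
                     unsigned int startbit, unsigned int nbits) {
  for (unsigned int shift = startbit; shift < startbit + nbits; shift++) {
    std::vector<unsigned int> zeros, ones;
    for (unsigned int k : keys)
      ((k >> shift) & 1u) ? ones.push_back(k) : zeros.push_back(k);
    keys = zeros;                                 // zero bits first (stable)
    keys.insert(keys.end(), ones.begin(), ones.end());
  }
}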
/** @} */ // end radix sort functions
/** @} */ // end cudpp_cta


@ -0,0 +1,619 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5633 $
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* scan_cta.cu
*
* @brief CUDPP CTA-level scan routines
*/
/** \defgroup cudpp_cta CUDPP CTA-Level API
* The CUDPP CTA-Level API contains functions that run on the GPU
* device. These are CUDA \c __device__ functions that are called
* from within other CUDA device functions (typically
* \link cudpp_kernel CUDPP Kernel-Level API\endlink functions).
* They are called CTA-level functions because they typically process
* data "owned" by each CTA within shared memory, and are agnostic of
* any other CTAs that may be running (or how many CTAs are running),
* other than to compute appropriate global memory addresses.
* @{
*/
/** @name Scan Functions
* @{
*/
#include <cudpp_globals.h>
#include <cudpp_util.h>
#include <math.h>
#include <cudpp.h>
/**
* @brief Macro to insert necessary __syncthreads() in device emulation mode
*/
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif
/**
* @brief Template class containing compile-time parameters to the scan functions
*
* ScanTraits is passed as a template parameter to all scan functions. By
* using these compile-time functions we can enable generic code while
* maintaining the highest performance. This is crucial for the performance
* of low-level workhorse algorithms like scan.
*
* @param T The datatype of the scan
* @param oper The ::CUDPPOperator to use for the scan (add, max, etc.)
* @param multiRow True if this is a multi-row scan
* @param unroll True if scan inner loops should be unrolled
* @param sums True if each block should write its sum to the d_blockSums array (false for single-block scans)
* @param backward True if this is a backward scan
* @param fullBlock True if all blocks in this scan are full (CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements)
* @param exclusive True for exclusive scans, false for inclusive scans
*/
template <class T, CUDPPOperator oper, bool backward, bool exclusive,
bool multiRow, bool sums, bool fullBlock>
class ScanTraits
{
public:
//! Returns true if this is a backward scan
static __device__ bool isBackward() { return backward; };
//! Returns true if this is an exclusive scan
static __device__ bool isExclusive() { return exclusive; };
//! Returns true if this a multi-row scan.
static __device__ bool isMultiRow() { return multiRow; };
//! Returns true if this scan writes the sum of each block to the d_blockSums array (multi-block scans)
static __device__ bool writeSums() { return sums; };
//! Returns true if this is a full scan -- all blocks process CTA_SIZE * SCAN_ELEMENTS_PER_THREAD elements
static __device__ bool isFullBlock() { return fullBlock; };
//! The operator function used for the scan
static __device__ T op(const T a, const T b)
{
return Operator<T, oper>::op(a, b);
}
//! The identity value used by the scan
static __device__ T identity() { return Operator<T, oper>::identity(); }
};
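// Illustrative note (not from the CUDPP sources): for example, a forward,
// exclusive, single-row float sum-scan whose blocks write their totals to
// d_blockSums and always process full blocks would be instantiated as
//
//     ScanTraits<float, CUDPP_ADD, false, true, false, true, true>
//
// (template order: T, oper, backward, exclusive, multiRow, sums, fullBlock)
// and passed as the "traits" parameter of the CTA-level routines below.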
//! This is used to insert syncthreads to avoid perf loss caused by 128-bit
//! load overlap that happens on G80. This gives about a 15% boost on scans on
//! G80.
//! @todo Parameterize this in case this perf detail changes on future GPUs.
#define DISALLOW_LOADSTORE_OVERLAP 1
/**
* @brief Handles loading input data from global memory to shared memory
* (vec4 version)
*
* Load a chunk of 8*blockDim.x elements from global memory into a
* shared memory array. Each thread loads two T4 elements (where
* T4 is, e.g. int4 or float4), computes the scan of those two vec4s in
* thread local arrays (in registers), and writes the two total sums of the
* vec4s into shared memory, where they will be cooperatively scanned with
* the other partial sums by all threads in the CTA.
*
* @param[out] s_out The output (shared) memory array
* @param[out] threadScan0 Intermediate per-thread partial sums array 1
* @param[out] threadScan1 Intermediate per-thread partial sums array 2
* @param[in] d_in The input (device) memory array
* @param[in] numElements The number of elements in the array being scanned
* @param[in] iDataOffset the offset of the input array in global memory for this
* thread block
* @param[out] ai The shared memory address for the thread's first element
* (returned for reuse)
* @param[out] bi The shared memory address for the thread's second element
* (returned for reuse)
* @param[out] aiDev The device memory address for this thread's first element
* (returned for reuse)
* @param[out] biDev The device memory address for this thread's second element
* (returned for reuse)
*/
template <class T, class traits>
__device__ void loadSharedChunkFromMem4(T *s_out,
T threadScan0[4],
T threadScan1[4],
const T *d_in,
int numElements,
int iDataOffset,
int &ai,
int &bi,
int &aiDev,
int &biDev)
{
int thid = threadIdx.x;
aiDev = iDataOffset + thid;
biDev = aiDev + blockDim.x;
// convert to 4-vector
typename typeToVector<T,4>::Result tempData;
typename typeToVector<T,4>::Result* inData = (typename typeToVector<T,4>::Result*)d_in;
ai = thid;
bi = thid + blockDim.x;
// read into tempData;
if (traits::isBackward())
{
int i = aiDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
tempData = inData[aiDev];
threadScan0[3] = tempData.w;
threadScan0[2] = traits::op(tempData.z, threadScan0[3]);
threadScan0[1] = traits::op(tempData.y, threadScan0[2]);
threadScan0[0] = s_out[ai]
= traits::op(tempData.x, threadScan0[1]);
}
else
{
threadScan0[3] = traits::identity();
threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[3]);
threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[2]);
threadScan0[0] = s_out[ai]
= traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan0[1]);
}
#ifdef DISALLOW_LOADSTORE_OVERLAP
__syncthreads();
#endif
i = biDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
tempData = inData[biDev];
threadScan1[3] = tempData.w;
threadScan1[2] = traits::op(tempData.z, threadScan1[3]);
threadScan1[1] = traits::op(tempData.y, threadScan1[2]);
threadScan1[0] = s_out[bi]
= traits::op(tempData.x, threadScan1[1]);
}
else
{
threadScan1[3] = traits::identity();
threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[3]);
threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[2]);
threadScan1[0] = s_out[bi]
= traits::op((i < numElements) ? d_in[i] : traits::identity(), threadScan1[1]);
}
__syncthreads();
// reverse s_data in shared memory
if (ai < CTA_SIZE)
{
unsigned int leftIdx = ai;
unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai;
if (leftIdx < rightIdx)
{
T tmp = s_out[leftIdx];
s_out[leftIdx] = s_out[rightIdx];
s_out[rightIdx] = tmp;
}
}
__syncthreads();
}
else
{
int i = aiDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
tempData = inData[aiDev];
threadScan0[0] = tempData.x;
threadScan0[1] = traits::op(tempData.y, threadScan0[0]);
threadScan0[2] = traits::op(tempData.z, threadScan0[1]);
threadScan0[3] = s_out[ai]
= traits::op(tempData.w, threadScan0[2]);
}
else
{
threadScan0[0] = (i < numElements) ? d_in[i] : traits::identity();
threadScan0[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan0[0]);
threadScan0[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan0[1]);
threadScan0[3] = s_out[ai]
= traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan0[2]);
}
#ifdef DISALLOW_LOADSTORE_OVERLAP
__syncthreads();
#endif
i = biDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
tempData = inData[biDev];
threadScan1[0] = tempData.x;
threadScan1[1] = traits::op(tempData.y, threadScan1[0]);
threadScan1[2] = traits::op(tempData.z, threadScan1[1]);
threadScan1[3] = s_out[bi]
= traits::op(tempData.w, threadScan1[2]);
}
else
{
threadScan1[0] = (i < numElements) ? d_in[i] : traits::identity();
threadScan1[1] = traits::op(((i+1) < numElements) ? d_in[i+1] : traits::identity(), threadScan1[0]);
threadScan1[2] = traits::op(((i+2) < numElements) ? d_in[i+2] : traits::identity(), threadScan1[1]);
threadScan1[3] = s_out[bi]
= traits::op(((i+3) < numElements) ? d_in[i+3] : traits::identity(), threadScan1[2]);
}
__syncthreads();
}
}
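/* Illustrative example (not from the CUDPP sources): for a forward ADD scan
   with tempData = (1, 2, 3, 4), the per-thread serial scan above produces
       threadScan0 = { 1, 3, 6, 10 }
   and writes the vec4 total, 10, to s_out[ai]; the block-wide scan that
   follows then only has to combine one partial sum per vec4 rather than
   every element. */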
/**
* @brief Handles storing result data from shared memory to global memory
* (vec4 version)
*
* Store a chunk of SCAN_ELTS_PER_THREAD*blockDim.x elements from shared memory
* into a device memory array. Each thread reads two elements from shared
* memory, adds them to the intermediate sums computed in
* loadSharedChunkFromMem4(), and writes two T4 elements (where
* T4 is, e.g. int4 or float4) to global memory.
*
* @param[out] d_out The output (device) memory array
* @param[in] threadScan0 Intermediate per-thread partial sums array 1
* (contents computed in loadSharedChunkFromMem4())
* @param[in] threadScan1 Intermediate per-thread partial sums array 2
* (contents computed in loadSharedChunkFromMem4())
* @param[in] s_in The input (shared) memory array
* @param[in] numElements The number of elements in the array being scanned
* @param[in] oDataOffset the offset of the output array in global memory
* for this thread block
* @param[in] ai The shared memory address for the thread's first element
* (computed in loadSharedChunkFromMem4())
* @param[in] bi The shared memory address for the thread's second element
* (computed in loadSharedChunkFromMem4())
* @param[in] aiDev The device memory address for this thread's first element
* (computed in loadSharedChunkFromMem4())
* @param[in] biDev The device memory address for this thread's second element
* (computed in loadSharedChunkFromMem4())
*/
template <class T, class traits>
__device__ void storeSharedChunkToMem4(T *d_out,
T threadScan0[4],
T threadScan1[4],
T *s_in,
int numElements,
int oDataOffset,
int ai,
int bi,
int aiDev,
int biDev)
{
// Convert to 4-vector
typename typeToVector<T,4>::Result tempData;
typename typeToVector<T,4>::Result* outData = (typename typeToVector<T,4>::Result*)d_out;
// write results to global memory
if (traits::isBackward())
{
if (ai < CTA_SIZE)
{
unsigned int leftIdx = ai;
unsigned int rightIdx = (2 * CTA_SIZE - 1) - ai;
if (leftIdx < rightIdx)
{
T tmp = s_in[leftIdx];
s_in[leftIdx] = s_in[rightIdx];
s_in[rightIdx] = tmp;
}
}
__syncthreads();
T temp = s_in[ai];
if (traits::isExclusive())
{
tempData.w = temp;
tempData.z = traits::op(temp, threadScan0[3]);
tempData.y = traits::op(temp, threadScan0[2]);
tempData.x = traits::op(temp, threadScan0[1]);
}
else
{
tempData.w = traits::op(temp, threadScan0[3]);
tempData.z = traits::op(temp, threadScan0[2]);
tempData.y = traits::op(temp, threadScan0[1]);
tempData.x = traits::op(temp, threadScan0[0]);
}
int i = aiDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
outData[aiDev] = tempData;
}
else
{
if (i < numElements) { d_out[i] = tempData.x;
if (i+1 < numElements) { d_out[i+1] = tempData.y;
if (i+2 < numElements) { d_out[i+2] = tempData.z; }}}
}
#ifdef DISALLOW_LOADSTORE_OVERLAP
__syncthreads();
#endif
temp = s_in[bi];
if (traits::isExclusive())
{
tempData.w = temp;
tempData.z = traits::op(temp, threadScan1[3]);
tempData.y = traits::op(temp, threadScan1[2]);
tempData.x = traits::op(temp, threadScan1[1]);
}
else
{
tempData.w = traits::op(temp, threadScan1[3]);
tempData.z = traits::op(temp, threadScan1[2]);
tempData.y = traits::op(temp, threadScan1[1]);
tempData.x = traits::op(temp, threadScan1[0]);
}
i = biDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
outData[biDev] = tempData;
}
else
{
if (i < numElements) { d_out[i] = tempData.x;
if (i+1 < numElements) { d_out[i+1] = tempData.y;
if (i+2 < numElements) { d_out[i+2] = tempData.z; }}}
}
}
else
{
T temp;
temp = s_in[ai];
if (traits::isExclusive())
{
tempData.x = temp;
tempData.y = traits::op(temp, threadScan0[0]);
tempData.z = traits::op(temp, threadScan0[1]);
tempData.w = traits::op(temp, threadScan0[2]);
}
else
{
tempData.x = traits::op(temp, threadScan0[0]);
tempData.y = traits::op(temp, threadScan0[1]);
tempData.z = traits::op(temp, threadScan0[2]);
tempData.w = traits::op(temp, threadScan0[3]);
}
int i = aiDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
outData[aiDev] = tempData;
}
else
{
// we can't use vec4 because the original array isn't a multiple of
// 4 elements
if ( i < numElements) { d_out[i] = tempData.x;
if ((i+1) < numElements) { d_out[i+1] = tempData.y;
if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
}
#ifdef DISALLOW_LOADSTORE_OVERLAP
__syncthreads();
#endif
temp = s_in[bi];
if (traits::isExclusive())
{
tempData.x = temp;
tempData.y = traits::op(temp, threadScan1[0]);
tempData.z = traits::op(temp, threadScan1[1]);
tempData.w = traits::op(temp, threadScan1[2]);
}
else
{
tempData.x = traits::op(temp, threadScan1[0]);
tempData.y = traits::op(temp, threadScan1[1]);
tempData.z = traits::op(temp, threadScan1[2]);
tempData.w = traits::op(temp, threadScan1[3]);
}
i = biDev * 4;
if (traits::isFullBlock() || i + 3 < numElements)
{
outData[biDev] = tempData;
}
else
{
// we can't use vec4 because the original array isn't a multiple of
// 4 elements
if ( i < numElements) { d_out[i] = tempData.x;
if ((i+1) < numElements) { d_out[i+1] = tempData.y;
if ((i+2) < numElements) { d_out[i+2] = tempData.z; } } }
}
}
}
/** @brief Scan all warps of a CTA without synchronization
*
* The warp-scan algorithm breaks a block of data into warp-sized chunks, and
* scans the chunks independently with a warp of threads each. Because warps
* execute instructions in SIMD fashion, there is no need to synchronize in
* order to share data within a warp (only across warps). Also, in SIMD the
* most efficient algorithm is a step-efficient algorithm. Therefore, within
* each warp we use a Hillis-and-Steele-style scan that takes log2(N) steps
* to scan the warp [Daniel Hillis and Guy Steele 1986], rather than the
* work-efficient tree-based algorithm described by Guy Blelloch [1990] that
* takes 2 * log(N) steps and is in general more complex to implement.
* Previous versions of CUDPP used the Blelloch algorithm. For current GPUs,
* the warp size is 32, so this takes five steps per warp.
*
* Each thread is responsible for a single element of the array to be scanned.
* Each thread inputs a single value to the scan via \a val and returns
* its own scanned result element. The threads of each warp cooperate
* via the shared memory array \a s_data to scan WARP_SIZE elements.
*
* Template parameter \a maxlevel allows this warpscan to be performed on
* partial warps. For example, if only the first 8 elements of each warp need
* to be scanned, then warpscan only performs log2(8)=3 steps rather than 5.
*
* The computation uses 2 * WARP_SIZE elements of shared memory per warp to
* enable warps to offset beyond their input data and receive the identity
* element without using any branch instructions.
*
* \note s_data is declared volatile here to prevent the compiler from
* optimizing away writes to shared memory, and ensure correct intrawarp
* communication in the absence of __syncthreads.
*
* @return The result of the warp scan for the current thread
* @param[in] val The current thread's input to the scan
* @param[in,out] s_data A pointer to a temporary shared array of 2*CTA_SIZE
* elements used to compute the warp scans
*/
template<class T, class traits,int maxlevel>
__device__ T warpscan(T val, volatile T* s_data)
{
// The following is the same as 2 * 32 * warpId + threadInWarp =
// 64*(threadIdx.x >> 5) + (threadIdx.x & (WARP_SIZE-1))
int idx = 2 * threadIdx.x - (threadIdx.x & (WARP_SIZE-1));
s_data[idx] = traits::identity();
idx += WARP_SIZE;
T t = s_data[idx] = val; __EMUSYNC;
// This code is needed because the warp size of device emulation
// is only 1 thread, so sync-less cooperation within a warp doesn't
// work.
#ifdef __DEVICE_EMULATION__
t = s_data[idx - 1]; __EMUSYNC;
s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
t = s_data[idx - 2]; __EMUSYNC;
s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
t = s_data[idx - 4]; __EMUSYNC;
s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
t = s_data[idx - 8]; __EMUSYNC;
s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
t = s_data[idx - 16]; __EMUSYNC;
s_data[idx] = traits::op(s_data[idx],t); __EMUSYNC;
#else
if (0 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 1]); }
if (1 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 2]); }
if (2 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 4]); }
if (3 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx - 8]); }
if (4 <= maxlevel) { s_data[idx] = t = traits::op(t, s_data[idx -16]); }
#endif
return s_data[idx-1]; // convert inclusive -> exclusive
}
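/* Illustrative trace (not from the CUDPP sources): an 8-element sub-warp
   sum-scanned with three steps (offsets 1, 2, 4):
       input      : 3  1  7  0  4  1  6  3
       offset 1   : 3  4  8  7  4  5  7  9
       offset 2   : 3  4 11 11 12 12 11 14
       offset 4   : 3  4 11 11 15 16 22 25   (inclusive result)
   warpscan() returns s_data[idx - 1], i.e. the exclusive scan
       output     : 0  3  4 11 11 15 16 22
   where the leading 0 is the identity written into the lower half of s_data. */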
/** @brief Perform a full CTA scan using the warp-scan algorithm
*
* As described in the comment for warpscan(), the warp-scan algorithm breaks
* a block of data into warp-sized chunks, and scans the chunks independently
* with a warp of threads each. To complete the scan, each warp <i>j</i> then
* writes its last element to element <i>j</i> of a temporary shared array.
* Then a single warp exclusive-scans these "warp sums". Finally, each thread
* adds the result of the warp sum scan to the result of the scan from the
* first pass.
*
* Because we scan 2*CTA_SIZE elements per thread, we have to call warpscan
* twice.
*
* @param x The first input value for the current thread
* @param y The second input value for the current thread
* @param s_data Temporary shared memory space of 2*CTA_SIZE elements for
* performing the scan
*/
template <class T, class traits>
__device__ void scanWarps(T x, T y,
T *s_data)
{
T val = warpscan<T, traits, 4>(x, s_data);
__syncthreads();
T val2 = warpscan<T, traits, 4>(y, s_data);
int idx = threadIdx.x;
if ((idx & 31)==31)
{
s_data[idx >> 5] = traits::op(val, x);
s_data[(idx + blockDim.x) >> 5] = traits::op(val2, y);
}
__syncthreads();
#ifndef __DEVICE_EMULATION__
if (idx < 32)
#endif
{
s_data[idx] = warpscan<T,traits,(LOG_CTA_SIZE-LOG_WARP_SIZE+1)>(s_data[idx], s_data);
}
__syncthreads();
val = traits::op(val, s_data[idx >> 5]);
val2 = traits::op(val2, s_data[(idx + blockDim.x) >> 5]);
__syncthreads();
s_data[idx] = val;
s_data[idx+blockDim.x] = val2;
}
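/* Illustrative note (not from the CUDPP sources): with CTA_SIZE = 128 and
   WARP_SIZE = 32, scanWarps() covers 2 * 128 = 256 elements as 8 warp-sized
   chunks. The 8 warp totals land in s_data[0..7], the first warp
   exclusive-scans them, and each thread then offsets its warp-local result by
   its warp's scanned total before the final values are written back. */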
/**
* @brief CTA-level scan routine; scans s_data in shared memory in each thread block
*
* This function is the main CTA-level scan function. It may be called by other
* CUDA __global__ or __device__ functions. This function scans 2 * CTA_SIZE elements.
* Each thread is responsible for one element in each half of the input array.
* \note This code is intended to be run on a CTA of 128 threads. Other sizes are
* untested.
*
* @param[in] s_data The array to be scanned in shared memory
* @param[out] d_blockSums Array of per-block sums
* @param[in] blockSumIndex Location in \a d_blockSums to which to write this block's sum
*/
template <class T, class traits>
__device__ void scanCTA(T *s_data,
T *d_blockSums,
unsigned int blockSumIndex)
{
T val = s_data[threadIdx.x];
T val2 = s_data[threadIdx.x + blockDim.x];
__syncthreads();
scanWarps<T,traits>(val, val2, s_data);
__syncthreads();
if (traits::writeSums() && threadIdx.x == blockDim.x - 1)
{
d_blockSums[blockSumIndex] = traits::op(val2, s_data[threadIdx.x + blockDim.x]);
}
#ifdef __DEVICE_EMULATION__
// must sync in emulation mode when doing backward scans, because otherwise the
// shared memory array will get reversed before the block sums are read!
if (traits::isBackward())
__syncthreads();
#endif
}
/** @} */ // end scan functions
/** @} */ // end cudpp_cta


@@ -0,0 +1,417 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5632 $
// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* cudpp.cpp
*
* @brief Main library source file. Implements wrappers for public
* interface.
*
* Main library source file. Implements wrappers for public
* interface. These wrappers call application-level operators.
* As this grows we may decide to partition into multiple source
* files.
*/
/**
* \defgroup publicInterface CUDPP Public Interface
* The CUDA public interface comprises the functions, structs, and enums
* defined in cudpp.h. Public interface functions call functions in the
* \link cudpp_app Application-Level\endlink interface. The public
* interface functions include Plan Interface functions and Algorithm
* Interface functions. Plan Interface functions are used for creating
* CUDPP Plan objects which contain configuration details, intermediate
* storage space, and in the case of cudppSparseMatrix(), data. The
* Algorithm Interface is the set of functions that do the real work
* of CUDPP, such as cudppScan() and cudppSparseMatrixVectorMultiply().
*
* @{
*/
/** @name Algorithm Interface
* @{
*/
#include "cudpp.h"
#include "cudpp_plan_manager.h"
#include "cudpp_scan.h"
//#include "cudpp_segscan.h"
//#include "cudpp_compact.h"
//#include "cudpp_spmvmult.h"
#include "cudpp_radixsort.h"
//#include "cudpp_rand.h"
/**
* @brief Performs a scan operation of numElements on its input in
* GPU memory (d_in) and places the output in GPU memory
* (d_out), with the scan parameters specified in the plan pointed to by
* planHandle.
* The input to a scan operation is an input array, a binary associative
* operator (like + or max), and an identity element for that operator
* (+'s identity is 0). The output of scan is the same size as its input.
* Informally, the output at each element is the result of operator
* applied to each input that comes before it. For instance, the
* output of sum-scan at each element is the sum of all the input
* elements before that input.
*
* More formally, for associative operator
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly,
* <var>out<sub>i</sub></var> = <var>in<sub>0</sub></var>
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>1</sub></var>
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly ...
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>i-1</sub></var>.
*
* CUDPP supports "exclusive" and "inclusive" scans. For the ADD operator,
* an exclusive scan computes the sum of all input elements before the
* current element, while an inclusive scan computes the sum of all input
* elements up to and including the current element.
*
* Before calling scan, create an internal plan using cudppPlan().
*
* After you are finished with the scan plan, clean up with cudppDestroyPlan().
*
* @param[in] planHandle Handle to plan for this scan
* @param[out] d_out output of scan, in GPU memory
* @param[in] d_in input to scan, in GPU memory
* @param[in] numElements number of elements to scan
*
* @see cudppPlan, cudppDestroyPlan
*/
CUDPP_DLL
CUDPPResult cudppScan(CUDPPHandle planHandle,
void *d_out,
const void *d_in,
size_t numElements)
{
CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppScanDispatch(d_out, d_in, numElements, 1, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
}
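/* Illustrative host-side sketch (not from the CUDPP sources; assumes d_in and
   d_out are device arrays of numElements floats, and omits error handling):

       CUDPPConfiguration config;
       config.algorithm = CUDPP_SCAN;
       config.op        = CUDPP_ADD;
       config.datatype  = CUDPP_FLOAT;
       config.options   = CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE;

       CUDPPHandle scanPlan;
       if (cudppPlan(&scanPlan, config, numElements, 1, 0) == CUDPP_SUCCESS)
       {
           cudppScan(scanPlan, d_out, d_in, numElements);
           cudppDestroyPlan(scanPlan);
       }
*/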
/**
* @brief Performs a segmented scan operation of numElements on its input in
* GPU memory (d_idata) and places the output in GPU memory
* (d_out), with the scan parameters specified in the plan pointed to by
* planHandle.
* The input to a segmented scan operation is an input array of data,
* an input array of flags which demarcate segments, a binary associative
* operator (like + or max), and an identity element for that operator
* (+'s identity is 0). The array of flags is the same length as the input
* with 1 marking the first element of a segment and 0 otherwise. The
* output of segmented scan is the same size as its input. Informally, the
* output at each element is the result of operator applied to each input
* that comes before it in that segment. For instance, the output of
* segmented sum-scan at each element is the sum of all the input elements
* before that input in that segment.
*
* More formally, for associative operator
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly,
* <var>out<sub>i</sub></var> = <var>in<sub>k</sub></var>
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>k+1</sub></var>
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly ...
* @htmlonly&oplus;@endhtmlonly@latexonly$\oplus$@endlatexonly
* <var>in<sub>i-1</sub></var>.
* <i>k</i> is the index of the first element of the segment in which <i>i</i> lies
*
* We support both "exclusive" and "inclusive" variants. For a segmented sum-scan,
* the exclusive variant computes the sum of all input elements before the
* current element in that segment, while the inclusive variant computes the
* sum of all input elements up to and including the current element, in
* that segment.
*
* Before calling segmented scan, create an internal plan using cudppPlan().
*
* After you are finished with the scan plan, clean up with cudppDestroyPlan().
* @param[in] planHandle Handle to plan for this scan
* @param[out] d_out output of segmented scan, in GPU memory
* @param[in] d_idata input data to segmented scan, in GPU memory
* @param[in] d_iflags input flags to segmented scan, in GPU memory
* @param[in] numElements number of elements to perform segmented scan on
*
* @see cudppPlan, cudppDestroyPlan
CUDPP_DLL
CUDPPResult cudppSegmentedScan(CUDPPHandle planHandle,
void *d_out,
const void *d_idata,
const unsigned int *d_iflags,
size_t numElements)
{
CUDPPSegmentedScanPlan *plan =
(CUDPPSegmentedScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppSegmentedScanDispatch(d_out, d_idata, d_iflags, numElements, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
}
*/
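/* Illustrative example of the segmented scan described above (not from the
   CUDPP sources; the wrapper itself is disabled in this cudpp_mini build):

       d_idata  = [ 1  2  3  4  5  6 ]
       d_iflags = [ 1  0  0  1  0  0 ]
       inclusive segmented sum-scan -> [ 1  3  6  4  9 15 ]
       exclusive segmented sum-scan -> [ 0  1  3  0  4  9 ]
*/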
/**
* @brief Performs numRows parallel scan operations of numElements
* each on its input (d_in) and places the output in d_out,
* with the scan parameters set by config. Exactly like cudppScan
* except that it runs on multiple rows in parallel.
*
* Note that to achieve good performance with cudppMultiScan one should
* allocate the device arrays passed to it so that all rows are aligned
* to the correct boundaries for the architecture the app is running on.
* The easy way to do this is to use cudaMallocPitch() to allocate a
* 2D array on the device. Use the \a rowPitch parameter to cudppPlan()
* to specify this pitch. The easiest way is to pass the device pitch
* returned by cudaMallocPitch to cudppPlan() via \a rowPitch.
*
* @param[in] planHandle handle to CUDPPScanPlan
* @param[out] d_out output of scan, in GPU memory
* @param[in] d_in input to scan, in GPU memory
* @param[in] numElements number of elements (per row) to scan
* @param[in] numRows number of rows to scan in parallel
*
* @see cudppScan, cudppPlan
CUDPP_DLL
CUDPPResult cudppMultiScan(CUDPPHandle planHandle,
void *d_out,
const void *d_in,
size_t numElements,
size_t numRows)
{
CUDPPScanPlan *plan = (CUDPPScanPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppScanDispatch(d_out, d_in, numElements, numRows, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
}
*/
/**
* @brief Given an array \a d_in and an array of 1/0 flags in \a
* deviceValid, returns a compacted array in \a d_out containing only
* the "valid" values from \a d_in.
*
* Takes as input an array of elements in GPU memory
* (\a d_in) and an equal-sized unsigned int array in GPU memory
* (\a deviceValid) that indicate which of those input elements are
* valid. The output is a packed array, in GPU memory, of only those
* elements marked as valid.
*
* Internally, uses cudppScan.
*
* Example:
* \code
* d_in = [ a b c d e f ]
* deviceValid = [ 1 0 1 1 0 1 ]
* d_out = [ a c d f ]
* \endcode
*
* @todo [MJH] We need to evaluate whether cudppCompact should be a core member
* of the public interface. It's not clear to me that what the user always
* wants is a final compacted array. Often one just wants the array of indices
* to which each input element should go in the output. The split() routine used
* in radix sort might make more sense to expose.
*
* @param[in] planHandle handle to CUDPPCompactPlan
* @param[out] d_out compacted output
* @param[out] d_numValidElements set during cudppCompact to the number of
* elements of d_in whose flags in the d_isValid input array are valid (nonzero)
* @param[in] d_in input to compact
* @param[in] d_isValid which elements in d_in are valid
* @param[in] numElements number of elements in d_in
CUDPP_DLL
CUDPPResult cudppCompact(CUDPPHandle planHandle,
void *d_out,
size_t *d_numValidElements,
const void *d_in,
const unsigned int *d_isValid,
size_t numElements)
{
CUDPPCompactPlan *plan = (CUDPPCompactPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppCompactDispatch(d_out, d_numValidElements, d_in, d_isValid,
numElements, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
}
}
*/
/**
* @brief Sorts key-value pairs or keys only
*
* Takes as input an array of keys in GPU memory
* (d_keys) and an optional array of corresponding values,
* and outputs sorted arrays of keys and (optionally) values in place.
* Key-value and key-only sort is selected through the configuration of
* the plan, using the options CUDPP_OPTION_KEYS_ONLY and
* CUDPP_OPTION_KEY_VALUE_PAIRS.
*
* Supported key types are CUDPP_FLOAT and CUDPP_UINT. Values can be
* any 32-bit type (internally, values are treated only as a payload
* and cast to unsigned int).
*
* @todo Determine if we need to provide an "out of place" sort interface.
*
* @param[in] planHandle handle to CUDPPSortPlan
* @param[out] d_keys keys by which key-value pairs will be sorted
* @param[in] d_values values to be sorted
* @param[in] keyBits the number of least significant bits in each element
* of d_keys to sort by
* @param[in] numElements number of elements in d_keys and d_values
*
* @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
*/
CUDPP_DLL
CUDPPResult cudppSort(CUDPPHandle planHandle,
void *d_keys,
void *d_values,
int keyBits,
size_t numElements)
{
CUDPPRadixSortPlan *plan = (CUDPPRadixSortPlan*)CUDPPPlanManager::GetPlan(planHandle);
if (plan != NULL)
{
cudppRadixSortDispatch(d_keys, d_values, numElements, keyBits, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
}
}
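/* Illustrative host-side sketch (not from the CUDPP sources; assumes d_keys
   and d_values are device arrays of numElements unsigned ints):

       CUDPPConfiguration config;
       config.algorithm = CUDPP_SORT_RADIX;
       config.datatype  = CUDPP_UINT;
       config.op        = CUDPP_ADD;   // operator is presumably ignored by the sort
       config.options   = CUDPP_OPTION_KEY_VALUE_PAIRS;

       CUDPPHandle sortPlan;
       if (cudppPlan(&sortPlan, config, numElements, 1, 0) == CUDPP_SUCCESS)
       {
           cudppSort(sortPlan, d_keys, d_values, 32, numElements); // sort on all 32 key bits
           cudppDestroyPlan(sortPlan);
       }
*/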
/** @brief Perform matrix-vector multiply y = A*x for arbitrary sparse matrix A and vector x
*
* Given a matrix object handle (which has been initialized using cudppSparseMatrix()),
* This function multiplies the input vector \a d_x by the matrix referred to by
* \a sparseMatrixHandle, returning the result in \a d_y.
*
* @param sparseMatrixHandle Handle to a sparse matrix object created with cudppSparseMatrix()
* @param d_y The output vector, y
* @param d_x The input vector, x
*
* @see cudppSparseMatrix, cudppDestroySparseMatrix
CUDPP_DLL
CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle sparseMatrixHandle,
void *d_y,
const void *d_x)
{
CUDPPSparseMatrixVectorMultiplyPlan *plan =
(CUDPPSparseMatrixVectorMultiplyPlan*)CUDPPPlanManager::GetPlan(sparseMatrixHandle);
if (plan != NULL)
{
cudppSparseMatrixVectorMultiplyDispatch(d_y, d_x, plan);
return CUDPP_SUCCESS;
}
else
{
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors.
}
}
*/
/**
* @brief Rand puts \a numElements random 32-bit elements into \a d_out
*
* Outputs \a numElements random values to \a d_out. \a d_out must be of
* type unsigned int, allocated in device memory.
*
* The algorithm used for the random number generation is stored in \a planHandle.
* Depending on the specification of the pseudo-random number generator (PRNG),
* the generator may have one or more seeds. To set the seed, use cudppRandSeed().
*
* @todo Currently only MD5 PRNG is supported. We may provide more rand routines in
* the future.
*
* @param[in] planHandle Handle to plan for rand
* @param[in] numElements number of elements in d_out.
* @param[out] d_out output of rand, in GPU memory. Should be an array of unsigned integers.
*
* @see cudppPlan, CUDPPConfiguration, CUDPPAlgorithm
CUDPP_DLL
CUDPPResult cudppRand(CUDPPHandle planHandle,void * d_out, size_t numElements)
{
CUDPPRandPlan * plan = (CUDPPRandPlan *) CUDPPPlanManager::GetPlan(planHandle);
if(plan != NULL)
{
//dispatch the rand algorithm here
cudppRandDispatch(d_out, numElements, plan);
return CUDPP_SUCCESS;
}
else
return CUDPP_ERROR_UNKNOWN; //! @todo Return more specific errors
}
*/
/**@brief Sets the seed used for rand
*
* The seed is crucial to any random number generator as it allows a
* sequence of random numbers to be replicated. Since there may be
* multiple different rand algorithms in CUDPP, cudppRandSeed
* uses \a planHandle to determine which seed to set. Each rand
* algorithm has its own unique set of seeds depending on what
* the algorithm needs.
*
* @param[in] planHandle the handle to the plan which specifies which rand seed to set
* @param[in] seed the value which the internal cudpp seed will be set to
CUDPP_DLL
CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed)
{
CUDPPRandPlan * plan = (CUDPPRandPlan *) CUDPPPlanManager::GetPlan(planHandle);
//switch on the plan to figure out which seed to update
switch(plan->m_config.algorithm)
{
case CUDPP_RAND_MD5:
plan->m_seed = seed;
break;
default:
break;
}
return CUDPP_SUCCESS;
}//end cudppRandSeed
*/
/** @} */ // end Algorithm Interface
/** @} */ // end of publicInterface group
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:

525
lib/gpu/cudpp_mini/cudpp.h Normal file

@@ -0,0 +1,525 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* cudpp.h
*
* @brief Main library header file. Defines public interface.
*
* The CUDPP public interface is a C-only interface to enable
* linking with code written in other languages (e.g. C, C++,
* and Fortran). While the internals of CUDPP are not limited
* to C (C++ features are used), the public interface is
* entirely C (thus it is declared "extern C").
*/
/**
* \mainpage
*
* \section introduction Introduction
*
* CUDPP is the CUDA Data Parallel Primitives Library. CUDPP is a
* library of data-parallel algorithm primitives such as
* parallel-prefix-sum ("scan"), parallel sort and parallel reduction.
* Primitives such as these are important building blocks for a wide
* variety of data-parallel algorithms, including sorting, stream
* compaction, and building data structures such as trees and
* summed-area tables.
*
* \section overview Overview Presentation
*
* A brief set of slides that describe the features, design principles,
* applications and impact of CUDPP is available here:
* <a href="http://cudpp.googlecode.com/svn/trunk/cudpp/doc/CUDPP_slides.pdf">CUDPP Presentation</a>.
*
* \section homepage Homepage
* Homepage for CUDPP: http://code.google.com/p/cudpp
*
* Announcements and discussion of CUDPP are hosted on the
* <a href="http://groups.google.com/group/cudpp?hl=en">CUDPP Google Group</a>.
*
* \section getting-started Getting Started with CUDPP
*
* You may want to start by browsing the \link publicInterface CUDPP Public
* Interface\endlink. For information on building CUDPP, see
* \ref building-cudpp "Building CUDPP".
*
* The "apps" subdirectory included with CUDPP has a few source code samples
* that use CUDPP:
* - \ref example_simpleCUDPP "simpleCUDPP", a simple example of using
* cudppScan()
* - satGL, an example of using cudppMultiScan() to generate a summed-area
* table (SAT) of a scene rendered in real time. The SAT is then used to simulate
* depth of field blur.
* - cudpp_testrig, a comprehensive test application for all the functionality
* of CUDPP
*
* We have also provided a code walkthrough of the
* \ref example_simpleCUDPP "simpleCUDPP" example.
*
* \section getting-help Getting Help and Reporting Problems
*
* To get help using CUDPP, please use the
* <a href="http://groups.google.com/group/cudpp?hl=en">CUDPP Google Group</a>.
*
* To report CUDPP bugs or request features, you may use either the above
* CUDPP Google Group, or you can file an issue directly using
* <a href="http://code.google.com/p/cudpp/issues/list">Google Code</a>.
*
* \section release-notes Release Notes
*
* For specific release details see the \ref changelog "Change Log".
*
* This release (1.1.1) is a bugfix release to CUDPP 1.1 that includes
* fixes to support CUDA 3.0 and the new NVIDIA Fermi architecture,
* including GeForce 400 series and Tesla 20 series GPUs. It also has
* bug fixes for 64-bit OSes.
*
* \section opSys Operating System Support
*
* This release (1.1.1) has been thoroughly tested on the following OSes.
* - Windows XP (32-bit) (CUDA 2.2, 3.0)
* - Windows 7 (64-bit) (CUDA 3.0)
* - Redhat Enterprise Linux 5 (64-bit) (CUDA 3.0)
* - and Mac OS X 10.6 (Snow Leopard, 64-bit) (CUDA 3.0)
*
* We expect CUDPP to build and run correctly on other flavors of Linux
* and Windows, but these are not actively tested by the developers at
* this time.
*
* Notes: CUDPP is not compatible with CUDA 2.1. A compiler bug in 2.1
* causes the compiler to crash. Also, starting with CUDPP 1.1.1, we are
* no longer testing CUDA device emulation, because it is deprecated in
* CUDA 3.0 and will be removed from future CUDA versions.
*
* \section cuda CUDA
* CUDPP is implemented in
* <a href="http://developer.nvidia.com/cuda">CUDA C/C++</a>. It requires the
* CUDA Toolkit version 2.2 or later. Please see the NVIDIA
* <a href="http://developer.nvidia.com/cuda">CUDA</a> homepage to download
* CUDA as well as the CUDA Programming Guide and CUDA SDK, which includes many
* CUDA code examples. Some of the samples in the CUDA SDK (including
* "marchingCubes", "lineOfSight", and radixSort) also use CUDPP.
*
* \section design-goals Design Goals
* Design goals for CUDPP include:
*
* - Performance. We aim to provide best-of-class performance for our
* primitives. We welcome suggestions and contributions that will improve
* CUDPP performance. We also want to provide primitives that can be easily
* benchmarked, and compared against other implementations on GPUs and other
* processors.
* - Modularity. We want our primitives to be easily included in other
* applications. To that end we have made the following design decisions:
* - CUDPP is provided as a library that can link against other applications.
* - CUDPP calls run on the GPU on GPU data. Thus they can be used
* as standalone calls on the GPU (on GPU data initialized by the
* calling application) and, more importantly, as GPU components in larger
* CPU/GPU applications.
* - CUDPP is implemented as 4 layers:
* -# The \link publicInterface Public Interface\endlink is the external
* library interface, which is the intended entry point for most
* applications. The public interface calls into the
* \link cudpp_app Application-Level API\endlink.
* -# The \link cudpp_app Application-Level API\endlink comprises functions
* callable from CPU code. These functions execute code jointly on the
* CPU (host) and the GPU by calling into the
* \link cudpp_kernel Kernel-Level API\endlink below them.
* -# The \link cudpp_kernel Kernel-Level API\endlink comprises functions
* that run entirely on the GPU across an entire grid of thread blocks.
* These functions may call into the \link cudpp_cta CTA-Level API\endlink
* below them.
* -# The \link cudpp_cta CTA-Level API\endlink comprises functions that run
* entirely on the GPU within a single Cooperative Thread Array (CTA,
* aka thread block). These are low-level functions that implement core
* data-parallel algorithms, typically by processing data within shared
* (CUDA \c __shared__) memory.
*
* Programmers may use any of the lower three CUDPP layers in their own
* programs by building the source directly into their application. However,
* the typical usage of CUDPP is to link to the library and invoke functions in
* the CUDPP \link publicInterface Public Interface\endlink, as in the
* \ref example_simpleCUDPP "simpleCUDPP", satGL, and cudpp_testrig application
* examples included in the CUDPP distribution.
*
* In the future, if and when CUDA supports building device-level libraries, we
* hope to enhance CUDPP to ease the use of CUDPP internal algorithms at all
* levels.
*
* \subsection uses Use Cases
* We expect the normal use of CUDPP will be in one of two ways:
* -# Linking the CUDPP library against another application.
* -# Running our "test" application, cudpp_testrig, that exercises
* CUDPP functionality.
*
* \section references References
* The following publications describe work incorporated in CUDPP.
*
* - Mark Harris, Shubhabrata Sengupta, and John D. Owens. "Parallel Prefix Sum (Scan) with CUDA". In Hubert Nguyen, editor, <i>GPU Gems 3</i>, chapter 39, pages 851&ndash;876. Addison Wesley, August 2007. http://graphics.idav.ucdavis.edu/publications/print_pub?pub_id=916
* - Shubhabrata Sengupta, Mark Harris, Yao Zhang, and John D. Owens. "Scan Primitives for GPU Computing". In <i>Graphics Hardware 2007</i>, pages 97&ndash;106, August 2007. http://graphics.idav.ucdavis.edu/publications/print_pub?pub_id=915
* - Shubhabrata Sengupta, Mark Harris, and Michael Garland. "Efficient parallel scan algorithms for GPUs". NVIDIA Technical Report NVR-2008-003, December 2008. http://mgarland.org/papers.html#segscan-tr
* - Nadathur Satish, Mark Harris, and Michael Garland. "Designing Efficient Sorting Algorithms for Manycore GPUs". In <i>Proceedings of the 23rd IEEE International Parallel & Distributed Processing Symposium</i>, May 2009. http://mgarland.org/papers.html#gpusort
* - Stanley Tzeng, Li-Yi Wei. "Parallel White Noise Generation on a GPU via Cryptographic Hash". In <i>Proceedings of the 2008 Symposium on Interactive 3D Graphics and Games</i>, pages 79&ndash;87, February 2008. http://research.microsoft.com/apps/pubs/default.aspx?id=70502
*
* Many researchers are using CUDPP in their work, and there are many publications
* that have used it \ref cudpp_refs "(references)". If your work uses CUDPP, please
* let us know by sending us a reference (preferably in BibTeX format) to your work.
*
* \section citing Citing CUDPP
*
* If you make use of CUDPP primitives in your work and want to cite
* CUDPP (thanks!), we would prefer for you to cite the appropriate
* papers above, since they form the core of CUDPP. To be more specific,
* the GPU Gems paper describes (unsegmented) scan, multi-scan for
* summed-area tables, and stream compaction. The NVIDIA technical report
* describes the current scan and segmented scan algorithms used in the
* library, and the Graphics Hardware paper describes an earlier
* implementation of segmented scan, quicksort, and sparse matrix-vector
* multiply. The IPDPS paper describes the radix sort used in CUDPP, and
* the I3D paper describes the random number generation algorithm.
*
* \section credits Credits
* \subsection developers CUDPP Developers
* - <a href="http://www.markmark.net">Mark Harris</a>, NVIDIA Corporation
* - <a href="http://www.ece.ucdavis.edu/~jowens/">John D. Owens</a>, University of California, Davis
* - <a href="http://graphics.cs.ucdavis.edu/~shubho/">Shubho Sengupta</a>, University of California, Davis
* - Stanley Tzeng, University of California, Davis
* - <a href="http://www.ece.ucdavis.edu/~yaozhang/">Yao Zhang</a>, University of California, Davis
* - <a href="http://www.ece.ucdavis.edu/~aaldavid/">Andrew Davidson</a>, University of California, Davis (formerly Louisiana State University)
*
* \subsection contributors Other CUDPP Contributors
* - <a href="http://www.eecs.berkeley.edu/~nrsatish/">Nadatur Satish</a>, University of California, Berkeley
*
* \subsection acknowledgments Acknowledgments
*
* Thanks to Jim Ahrens, Timo Aila, Nathan Bell, Ian Buck, Guy Blelloch,
* Jeff Bolz, Michael Garland, Jeff Inman, Eric Lengyel, Samuli Laine,
* David Luebke, Pat McCormick, and Richard Vuduc for their contributions
* during the development of this library.
*
* CUDPP Developers from UC Davis thank their funding agencies:
* - Department of Energy Early Career Principal Investigator Award
* DE-FG02-04ER25609
* - SciDAC Institute for Ultrascale Visualization (http://www.iusv.org/)
* - Los Alamos National Laboratory
* - National Science Foundation (grant 0541448)
* - Generous hardware donations from NVIDIA
*
* \section license-overview CUDPP Copyright and Software License
* CUDPP is copyright The Regents of the University of California, Davis campus
* and NVIDIA Corporation. The library, examples, and all source code are
* released under the BSD license, designed to encourage reuse of this software
* in other projects, both commercial and non-commercial. For details, please
* see the \ref license page.
*
* Note that prior to release 1.1 of CUDPP, the license used was a modified
* BSD license. With release 1.1, this license was replaced with the pure BSD
* license to facilitate the use of open source hosting of the code.
*/
/**
* @page license CUDPP License
*
* \section licenseBSD CUDPP License
*
* CUDPP is released under the
* <a href="http://www.opensource.org/licenses/bsd-license.php">BSD license</a>.
*
* @include license.txt
*
*/
/**
* @page changelog CUDPP Change Log
*
* @include changelog.txt
*/
/**
* @page cudpp_refs Publications that use CUDPP
*
* @htmlinclude doc/bib/cudpp_refs.html
*/
/**
* @page cudpp_refs_bib Bibliography for publications that use CUDPP
*
* @htmlinclude doc/bib/cudpp_refs_bib.html
*/
/**
* @page building-cudpp Building CUDPP
*
* CUDPP has currently been tested in Windows XP, Windows Vista, Mac OS X
* and Linux. See \ref release-notes for release specific platform support.
*
* \section build-win32 Building CUDPP on Windows XP
*
* CUDPP can be built using either MSVC 8 (2005) or MSVC 9 (2008). To
* build, open cudpp/cudpp.sln. Then you can build the library
* using the "build" command as you would with any other workspace. There are
* four configurations: debug, release, emudebug, and emurelease. The first
* two are self-explanatory. The second two are built to use CUDA device
* emulation, meaning they will be run (slowly) on the CPU.
*
* \section build-linux Building CUDPP on Linux and Mac OS X
*
* CUDPP can be built using standard g++ and Make tools on Linux, by typing
* "make" in the "cudpp/" subdirectory. Before building CUDPP, you should
* first build the CUDA Utility Library (libcutil) by typing "make; make dbg=1"
* in the "common/" subdirectory. This will generate libcutil.a and
* libcutilD.a.
*
* The makefile for CUDPP and all sample applications take the optional
* arguments "emu=1" and "dbg=1". The former builds CUDPP for device emulation,
* and the latter for debugging. The two flags can be combined. "verbose=1"
* can be used to see all compiler output.
*
* \section build-apps Building CUDPP Sample Applications
*
* The sample applications in the "apps/" subdirectory can be built exactly
* as CUDPP is: either by opening the appropriate .sln/.vcproj file in MSVC
* on Windows, or by running "make" on Linux.
*
* On some Linux installations you will get linker errors relating to "-lXi"
* and "-lXmu". To fix this, you will need to install libXi and libXmu. On
* Debian and Ubuntu, for example, you can simply run
* "sudo apt-get install libxi-dev", and
* "sudo apt-get install libxmu-dev"
*
*/
#ifndef __CUDPP_H__
#define __CUDPP_H__
#include <stdlib.h> // for size_t
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief CUDPP Result codes returned by CUDPP API functions.
*/
enum CUDPPResult
{
CUDPP_SUCCESS = 0, /**< No error. */
CUDPP_ERROR_INVALID_HANDLE, /**< Specified handle (for example,
to a plan) is invalid. **/
CUDPP_ERROR_ILLEGAL_CONFIGURATION, /**< Specified configuration is
illegal. For example, an
invalid or illogical
combination of options. */
CUDPP_ERROR_UNKNOWN = 9999 /**< Unknown or untraceable error. */
};
/**
* @brief Options for configuring CUDPP algorithms.
*
* @see CUDPPConfiguration, cudppPlan, CUDPPAlgorithm
*/
enum CUDPPOption
{
CUDPP_OPTION_FORWARD = 0x1, /**< Algorithms operate forward:
* from start to end of input
* array */
CUDPP_OPTION_BACKWARD = 0x2, /**< Algorithms operate backward:
* from end to start of array */
CUDPP_OPTION_EXCLUSIVE = 0x4, /**< Exclusive (for scans) - scan
* includes all elements up to (but
* not including) the current
* element */
CUDPP_OPTION_INCLUSIVE = 0x8, /**< Inclusive (for scans) - scan
* includes all elements up to and
* including the current element */
CUDPP_OPTION_CTA_LOCAL = 0x10, /**< Algorithm performed only on
* the CTAs (blocks) with no
* communication between blocks.
* @todo Currently ignored. */
CUDPP_OPTION_KEYS_ONLY = 0x20, /**< No associated value to a key
* (for global radix sort) */
CUDPP_OPTION_KEY_VALUE_PAIRS = 0x40, /**< Each key has an associated value */
};
/**
* @brief Datatypes supported by CUDPP algorithms.
*
* @see CUDPPConfiguration, cudppPlan
*/
enum CUDPPDatatype
{
CUDPP_CHAR, //!< Character type (C char)
CUDPP_UCHAR, //!< Unsigned character (byte) type (C unsigned char)
CUDPP_INT, //!< Integer type (C int)
CUDPP_UINT, //!< Unsigned integer type (C unsigned int)
CUDPP_FLOAT //!< Float type (C float)
};
/**
* @brief Operators supported by CUDPP algorithms (currently scan and
* segmented scan).
*
* These are all binary associative operators.
*
* @see CUDPPConfiguration, cudppPlan
*/
enum CUDPPOperator
{
CUDPP_ADD, //!< Addition of two operands
CUDPP_MULTIPLY, //!< Multiplication of two operands
CUDPP_MIN, //!< Minimum of two operands
CUDPP_MAX //!< Maximum of two operands
};
/**
* @brief Algorithms supported by CUDPP. Used to create appropriate plans using
* cudppPlan.
*
* @see CUDPPConfiguration, cudppPlan
*/
enum CUDPPAlgorithm
{
CUDPP_SCAN, //!< Scan or prefix-sum
CUDPP_SEGMENTED_SCAN, //!< Segmented scan
CUDPP_COMPACT, //!< Stream compact
CUDPP_REDUCE, //!< Parallel reduction (NOTE: currently unimplemented)
CUDPP_SORT_RADIX, //!< Radix sort
CUDPP_SPMVMULT, //!< Sparse matrix-dense vector multiplication
CUDPP_RAND_MD5, //!< PseudoRandom Number Generator using MD5 hash algorithm
CUDPP_ALGORITHM_INVALID, //!< Placeholder at end of enum
};
/**
* @brief Configuration struct used to specify algorithm, datatype,
* operator, and options when creating a plan for CUDPP algorithms.
*
* @see cudppPlan
*/
struct CUDPPConfiguration
{
CUDPPAlgorithm algorithm; //!< The algorithm to be used
CUDPPOperator op; //!< The numerical operator to be applied
CUDPPDatatype datatype; //!< The datatype of the input arrays
unsigned int options; //!< Options to configure the algorithm
};
#define CUDPP_INVALID_HANDLE 0xC0DABAD1
typedef size_t CUDPPHandle;
/* To use CUDPP as a static library, #define CUDPP_STATIC_LIB before
* including cudpp.h
*/
#define CUDPP_STATIC_LIB
#ifndef CUDPP_DLL
#ifdef _WIN32
#ifdef CUDPP_STATIC_LIB
#define CUDPP_DLL
#else
#ifdef BUILD_DLL
#define CUDPP_DLL __declspec(dllexport)
#else
#define CUDPP_DLL __declspec(dllimport)
#endif
#endif
#else
#define CUDPP_DLL
#endif
#endif
// Plan allocation (for scan, sort, and compact)
CUDPP_DLL
CUDPPResult cudppPlan(CUDPPHandle *planHandle,
CUDPPConfiguration config,
size_t n,
size_t rows,
size_t rowPitch);
CUDPP_DLL
CUDPPResult cudppDestroyPlan(CUDPPHandle plan);
// Scan and sort algorithms
CUDPP_DLL
CUDPPResult cudppScan(CUDPPHandle planHandle,
void *d_out,
const void *d_in,
size_t numElements);
CUDPP_DLL
CUDPPResult cudppMultiScan(CUDPPHandle planHandle,
void *d_out,
const void *d_in,
size_t numElements,
size_t numRows);
CUDPP_DLL
CUDPPResult cudppSegmentedScan(CUDPPHandle planHandle,
void *d_out,
const void *d_idata,
const unsigned int *d_iflags,
size_t numElements);
CUDPP_DLL
CUDPPResult cudppCompact(CUDPPHandle planHandle,
void *d_out,
size_t *d_numValidElements,
const void *d_in,
const unsigned int *d_isValid,
size_t numElements);
CUDPP_DLL
CUDPPResult cudppSort(CUDPPHandle planHandle,
void *d_keys,
void *d_values,
int keybits,
size_t numElements);
// Sparse matrix allocation
CUDPP_DLL
CUDPPResult cudppSparseMatrix(CUDPPHandle *sparseMatrixHandle,
CUDPPConfiguration config,
size_t n,
size_t rows,
const void *A,
const unsigned int *h_rowIndices,
const unsigned int *h_indices);
CUDPP_DLL
CUDPPResult cudppDestroySparseMatrix(CUDPPHandle sparseMatrixHandle);
// Sparse matrix-vector algorithms
CUDPP_DLL
CUDPPResult cudppSparseMatrixVectorMultiply(CUDPPHandle sparseMatrixHandle,
void *d_y,
const void *d_x);
// random number generation algorithms
CUDPP_DLL
CUDPPResult cudppRand(CUDPPHandle planHandle,void * d_out, size_t numElements);
CUDPP_DLL
CUDPPResult cudppRandSeed(const CUDPPHandle planHandle, unsigned int seed);
#ifdef __cplusplus
}
#endif
#endif
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:


@@ -0,0 +1,66 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* cudpp_globals.h
*
* @brief Global declarations defining machine characteristics of GPU target
* These are currently set for best performance on G8X GPUs. The optimal
* parameters may change on future GPUs. In the future, we hope to make
* CUDPP a self-tuning library.
*/
#ifndef __CUDPP_GLOBALS_H__
#define __CUDPP_GLOBALS_H__
const int NUM_BANKS = 16; /**< Number of shared memory banks */
const int LOG_NUM_BANKS = 4; /**< log_2(NUM_BANKS) */
const int CTA_SIZE = 128; /**< Number of threads in a CTA */
const int WARP_SIZE = 32; /**< Number of threads in a warp */
const int LOG_CTA_SIZE = 7; /**< log_2(CTA_SIZE) */
const int LOG_WARP_SIZE = 5; /**< log_2(WARP_SIZE) */
const int LOG_SIZEOF_FLOAT = 2; /**< log_2(sizeof(float)) */
const int SCAN_ELTS_PER_THREAD = 8; /**< Number of elements per scan thread */
const int SEGSCAN_ELTS_PER_THREAD = 8; /**< Number of elements per segmented scan thread */
const int maxSharedMemoryPerBlock = 16384; /**< Number of bytes of shared
memory in each block */
const int maxThreadsPerBlock = CTA_SIZE; /**< Maximum number of
* threads in a CTA */
/**
* @brief Macro to insert necessary __syncthreads() in device emulation mode
*/
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif
#define AVOID_BANK_CONFLICTS /**< Set if by default, we want our
* shared memory allocation to perform
* additional computation to avoid bank
* conflicts */
#ifdef AVOID_BANK_CONFLICTS
#define CONFLICT_FREE_OFFSET(index) ((index) >> LOG_NUM_BANKS)
#else
#define CONFLICT_FREE_OFFSET(index) (0)
#endif
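/* Illustrative example (not from the CUDPP sources): with NUM_BANKS = 16,
   CONFLICT_FREE_OFFSET pads a shared memory index by index >> 4, so indices
   16..31 become 17..32 once the offset is added; threads whose raw indices
   would land in the same bank are thereby spread across different banks. */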
#endif // __CUDPP_GLOBALS_H__
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:


@@ -0,0 +1,94 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp_maximal_launch.h"
inline size_t min(size_t x, size_t y)
{
return (x <= y) ? x : y;
}
inline size_t max(size_t x, size_t y)
{
return (x >= y) ? x : y;
}
// computes the number of f-sized groups needed to cover x, i.e. ceil(x / f)
inline size_t multiple(size_t x, size_t f)
{
return ((x + (f-1)) / f);
}
// MS Excel-style CEIL() function
// Rounds x up to nearest multiple of f
inline size_t ceiling(size_t x, size_t f)
{
return multiple(x, f) * f;
}
extern "C"
size_t maxBlocks(cudaFuncAttributes &attribs,
cudaDeviceProp &devprop,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock)
{
// Determine the maximum number of CTAs that can be run simultaneously for each kernel
// This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
const unsigned int regAllocationUnit = (devprop.major < 2 && devprop.minor < 2) ? 256 : 512; // in registers
const unsigned int warpAllocationMultiple = 2;
const unsigned int smemAllocationUnit = 512; // in bytes
const unsigned int maxThreadsPerSM = (devprop.major < 2 && devprop.minor < 2) ? 768 : 1024; // sm_12 GPUs increase threads/SM to 1024
const unsigned int maxBlocksPerSM = 8;
// Number of warps (round up to nearest whole multiple of warp size)
size_t numWarps = multiple(threadsPerBlock, devprop.warpSize);
// Round up to warp allocation multiple
numWarps = ceiling(numWarps, warpAllocationMultiple);
// Number of regs is regs per thread times number of warps times warp size
size_t regsPerCTA = attribs.numRegs * devprop.warpSize * numWarps;
// Round up to multiple of register allocation unit size
regsPerCTA = ceiling(regsPerCTA, regAllocationUnit);
size_t smemBytes = attribs.sharedSizeBytes + bytesDynamicSharedMem;
size_t smemPerCTA = ceiling(smemBytes, smemAllocationUnit);
size_t ctaLimitRegs = regsPerCTA > 0 ? devprop.regsPerBlock / regsPerCTA : maxBlocksPerSM;
size_t ctaLimitSMem = smemPerCTA > 0 ? devprop.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
size_t ctaLimitThreads = maxThreadsPerSM / threadsPerBlock;
return devprop.multiProcessorCount * min(ctaLimitRegs, min(ctaLimitSMem, min(ctaLimitThreads, maxBlocksPerSM)));
}
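// Worked example with illustrative (hypothetical) numbers: a kernel using 16
// registers per thread and 2048 bytes of static shared memory, launched with
// 128 threads per block on a compute capability 1.0 device (warpSize 32,
// 8192 registers and 16384 bytes of shared memory per SM, 768 threads per SM):
//   numWarps   = ceil(128/32) = 4, rounded up to a multiple of 2      -> 4
//   regsPerCTA = 16 * 32 * 4 = 2048, rounded up to a multiple of 256  -> 2048
//   smemPerCTA = 2048, rounded up to a multiple of 512                -> 2048
//   limits: regs 8192/2048 = 4, smem 16384/2048 = 8, threads 768/128 = 6, cap 8
//   => min = 4 blocks per SM, multiplied by multiProcessorCount.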
extern "C"
size_t maxBlocksFromPointer(void* kernel,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock)
{
cudaDeviceProp devprop;
int deviceID = -1;
cudaError_t err = cudaGetDevice(&deviceID);
if (err == cudaSuccess)
{
err = cudaGetDeviceProperties(&devprop, deviceID);
if (err != cudaSuccess)
return -1;
cudaFuncAttributes attr;
err = cudaFuncGetAttributes(&attr, (const char*)kernel);
if (err != cudaSuccess)
return -1;
return maxBlocks(attr, devprop, bytesDynamicSharedMem, threadsPerBlock);
}
return -1;
}

View File

@@ -0,0 +1,37 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#ifndef _MAXIMAL_LAUNCH_H_
#define _MAXIMAL_LAUNCH_H_
#include "cuda_runtime.h"
extern "C"
size_t maxBlocks(cudaFuncAttributes &attribs,
cudaDeviceProp &devprop,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock);
extern "C"
size_t maxBlocksFromPointer(void* kernel,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock);
#ifdef __cplusplus
template <typename T>
size_t maxBlocks(T kernel,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock)
{
return maxBlocksFromPointer((void*)kernel, bytesDynamicSharedMem, threadsPerBlock);
}
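// Illustrative usage sketch (hypothetical kernel name): query how many thread
// blocks of a kernel can be resident at once, e.g. for 128-thread blocks with
// no dynamic shared memory:
//
//   size_t nBlocks = maxBlocks(myScanKernel, 0, 128);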
#endif
#endif // _MAXIMAL_LAUNCH_H_

View File

@@ -0,0 +1,459 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 3572$
// $Date: 2007-11-19 13:58:06 +0000 (Mon, 19 Nov 2007) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp.h"
#include "cudpp_plan_manager.h"
#include "cudpp_scan.h"
//#include "cudpp_segscan.h"
//#include "cudpp_compact.h"
//#include "cudpp_spmvmult.h"
#include "cudpp_radixsort.h"
#include <assert.h>
CUDPPPlanManager* CUDPPPlanManager::m_instance = NULL;
CUDPPResult validateOptions(CUDPPConfiguration config, size_t /*numElements*/, size_t numRows, size_t /*rowPitch*/)
{
CUDPPResult ret = CUDPP_SUCCESS;
if ((config.options & CUDPP_OPTION_BACKWARD) && (config.options & CUDPP_OPTION_FORWARD))
ret = CUDPP_ERROR_ILLEGAL_CONFIGURATION;
if ((config.options & CUDPP_OPTION_EXCLUSIVE) && (config.options & CUDPP_OPTION_INCLUSIVE))
ret = CUDPP_ERROR_ILLEGAL_CONFIGURATION;
if (config.algorithm == CUDPP_COMPACT && numRows > 1)
ret = CUDPP_ERROR_ILLEGAL_CONFIGURATION; //!< @todo: add support for multi-row cudppCompact
return ret;
}
/** @addtogroup publicInterface
* @{
*/
/** @name Plan Interface
* @{
*/
/** @brief Create a CUDPP plan
*
* A plan is a data structure containing state and intermediate storage space
* that CUDPP uses to execute algorithms on data. A plan is created by
* passing to cudppPlan() a CUDPPConfiguration that specifies the algorithm,
* operator, datatype, and options. The size of the data must also be passed
* to cudppPlan(), in the \a numElements, \a numRows, and \a rowPitch
* arguments. These sizes are used to allocate internal storage space at the
* time the plan is created. The CUDPP planner may use the sizes, options,
* and information about the present hardware to choose optimal settings.
*
* Note that \a numElements is the maximum size of the array to be processed
* with this plan. That means that a plan may be re-used to process (for
* example, to sort or scan) smaller arrays.
*
* @param[out] planHandle A pointer to an opaque handle to the internal plan
* @param[in] config The configuration struct specifying algorithm and options
* @param[in] numElements The maximum number of elements to be processed
* @param[in] numRows The number of rows (for 2D operations) to be processed
* @param[in] rowPitch The pitch of the rows of input data, in elements
*/
CUDPP_DLL
CUDPPResult cudppPlan(CUDPPHandle *planHandle,
CUDPPConfiguration config,
size_t numElements,
size_t numRows,
size_t rowPitch)
{
CUDPPResult result = CUDPP_SUCCESS;
CUDPPPlan *plan;
result = validateOptions(config, numElements, numRows, rowPitch);
if (result != CUDPP_SUCCESS)
{
*planHandle = CUDPP_INVALID_HANDLE;
return result;
}
switch (config.algorithm)
{
case CUDPP_SCAN:
{
plan = new CUDPPScanPlan(config, numElements, numRows, rowPitch);
break;
}
// case CUDPP_COMPACT:
// {
// plan = new CUDPPCompactPlan(config, numElements, numRows, rowPitch);
// break;
// }
case CUDPP_SORT_RADIX:
//case CUDPP_SORT_RADIX_GLOBAL:
{
plan = new CUDPPRadixSortPlan(config, numElements);
break;
}
/* case CUDPP_SEGMENTED_SCAN:
{
plan = new CUDPPSegmentedScanPlan(config, numElements);
break;
}
//new rand plan
case CUDPP_RAND_MD5:
{
plan = new CUDPPRandPlan(config, numElements);
break;
}
case CUDPP_REDUCE:*/
default:
//! @todo: implement cudppReduce()
return CUDPP_ERROR_ILLEGAL_CONFIGURATION;
break;
}
*planHandle = CUDPPPlanManager::AddPlan(plan);
if (CUDPP_INVALID_HANDLE == *planHandle)
return CUDPP_ERROR_UNKNOWN;
else
return CUDPP_SUCCESS;
}
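/* Minimal usage sketch (illustrative only, not part of this file): create and
 * destroy a plan for forward exclusive scans of unsigned ints on up to
 * 1,048,576 elements. The initializer follows the field order used by the
 * aggregate initializers elsewhere in this file; error handling is reduced to
 * a single check.
 *
 *   #include "cudpp.h"
 *
 *   CUDPPConfiguration config = { CUDPP_SCAN, CUDPP_ADD, CUDPP_UINT,
 *                                 CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE };
 *   CUDPPHandle scanPlan;
 *   if (cudppPlan(&scanPlan, config, 1048576, 1, 0) == CUDPP_SUCCESS)
 *   {
 *       // ... execute scans of up to 1,048,576 elements using scanPlan ...
 *       cudppDestroyPlan(scanPlan);
 *   }
 */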
/** @brief Destroy a CUDPP Plan
*
* Deletes the plan referred to by \a planHandle and all associated internal
* storage.
*
* @param[in] planHandle The CUDPPHandle to the plan to be destroyed
*/
CUDPP_DLL
CUDPPResult cudppDestroyPlan(CUDPPHandle planHandle)
{
if (CUDPPPlanManager::RemovePlan(planHandle) == false)
return CUDPP_ERROR_INVALID_HANDLE;
else
return CUDPP_SUCCESS;
}
/** @brief Create a CUDPP Sparse Matrix Object
*
* The sparse matrix plan is a data structure containing state and intermediate storage space
* that CUDPP uses to perform sparse matrix dense vector multiply. This plan is created by
* passing to CUDPPSparseMatrixVectorMultiplyPlan() a CUDPPConfiguration that specifies the
* algorithm (sparse matrix-dense vector multiply) and datatype, along with the sparse matrix
* itself in CSR format. The number of non-zero elements in the sparse matrix must also be passed
* as \a numNonZeroElements. This is used to allocate internal storage space at the time the
* sparse matrix plan is created.
*
* @param[out] sparseMatrixHandle A pointer to an opaque handle to the sparse matrix object
* @param[in] config The configuration struct specifying algorithm and options
* @param[in] numNonZeroElements The number of non zero elements in the sparse matrix
* @param[in] numRows This is the number of rows in y, x and A for y = A * x
* @param[in] A The matrix data
* @param[in] h_rowIndices An array containing the index of the start of each row in \a A
* @param[in] h_indices An array containing the index of each nonzero element in \a A
CUDPP_DLL
CUDPPResult cudppSparseMatrix(CUDPPHandle *sparseMatrixHandle,
CUDPPConfiguration config,
size_t numNonZeroElements,
size_t numRows,
const void *A,
const unsigned int *h_rowIndices,
const unsigned int *h_indices)
{
CUDPPResult result = CUDPP_SUCCESS;
CUDPPPlan *sparseMatrix;
if ((config.algorithm != CUDPP_SPMVMULT) ||
(numNonZeroElements <= 0) || (numRows <= 0))
{
result = CUDPP_ERROR_ILLEGAL_CONFIGURATION;
}
if (result != CUDPP_SUCCESS)
{
*sparseMatrixHandle = CUDPP_INVALID_HANDLE;
return result;
}
sparseMatrix =
new CUDPPSparseMatrixVectorMultiplyPlan(config, numNonZeroElements, A,
h_rowIndices, h_indices, numRows);
*sparseMatrixHandle = CUDPPPlanManager::AddPlan(sparseMatrix);
if (CUDPP_INVALID_HANDLE == *sparseMatrixHandle)
return CUDPP_ERROR_UNKNOWN;
else
return CUDPP_SUCCESS;
}
*/
/** @brief Destroy a CUDPP Sparse Matrix Object
*
* Deletes the sparse matrix data and plan referred to by \a sparseMatrixHandle
* and all associated internal storage.
*
* @param[in] sparseMatrixHandle The CUDPPHandle to the matrix object to be destroyed
CUDPP_DLL
CUDPPResult cudppDestroySparseMatrix(CUDPPHandle sparseMatrixHandle)
{
return cudppDestroyPlan(sparseMatrixHandle);
}
*/
/** @} */ // end Plan Interface
/** @} */ // end publicInterface
/** @brief Plan base class constructor
*
* @param[in] config The configuration struct specifying algorithm and options
* @param[in] numElements The maximum number of elements to be processed
* @param[in] numRows The number of rows (for 2D operations) to be processed
* @param[in] rowPitch The pitch of the rows of input data, in elements
*/
CUDPPPlan::CUDPPPlan(CUDPPConfiguration config,
size_t numElements,
size_t numRows,
size_t rowPitch)
: m_config(config),
m_numElements(numElements),
m_numRows(numRows),
m_rowPitch(rowPitch)
{
}
/** @brief Scan Plan constructor
*
* @param[in] config The configuration struct specifying algorithm and options
* @param[in] numElements The maximum number of elements to be scanned
* @param[in] numRows The maximum number of rows (for 2D operations) to be scanned
* @param[in] rowPitch The pitch of the rows of input data, in elements
*/
CUDPPScanPlan::CUDPPScanPlan(CUDPPConfiguration config,
size_t numElements,
size_t numRows,
size_t rowPitch)
: CUDPPPlan(config, numElements, numRows, rowPitch),
m_blockSums(0),
m_rowPitches(0),
m_numEltsAllocated(0),
m_numRowsAllocated(0),
m_numLevelsAllocated(0)
{
allocScanStorage(this);
}
/** @brief CUDPP scan plan destructor */
CUDPPScanPlan::~CUDPPScanPlan()
{
freeScanStorage(this);
}
/** @brief SegmentedScan Plan constructor
*
* @param[in] config The configuration struct specifying options
* @param[in] numElements The maximum number of elements to be scanned
CUDPPSegmentedScanPlan::CUDPPSegmentedScanPlan(CUDPPConfiguration config,
size_t numElements)
: CUDPPPlan(config, numElements, 1, 0),
m_blockSums(0),
m_blockFlags(0),
m_blockIndices(0),
m_numEltsAllocated(0),
m_numLevelsAllocated(0)
{
allocSegmentedScanStorage(this);
}
*/
/** @brief SegmentedScan plan destructor
CUDPPSegmentedScanPlan::~CUDPPSegmentedScanPlan()
{
freeSegmentedScanStorage(this);
}
*/
/** @brief Compact Plan constructor
*
* @param[in] config The configuration struct specifying options
* @param[in] numElements The maximum number of elements to be compacted
* @param[in] numRows The number of rows (for 2D operations) to be compacted
* @param[in] rowPitch The pitch of the rows of input data, in elements
CUDPPCompactPlan::CUDPPCompactPlan(CUDPPConfiguration config,
size_t numElements,
size_t numRows,
size_t rowPitch)
: CUDPPPlan(config, numElements, numRows, rowPitch),
m_d_outputIndices(0)
{
assert(numRows == 1); //!< @todo Add support for multirow compaction
CUDPPConfiguration scanConfig =
{
CUDPP_SCAN,
CUDPP_ADD,
CUDPP_UINT,
(config.options & CUDPP_OPTION_BACKWARD) ?
CUDPP_OPTION_BACKWARD | CUDPP_OPTION_EXCLUSIVE :
CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE
};
m_scanPlan = new CUDPPScanPlan(scanConfig, numElements, numRows, rowPitch);
allocCompactStorage(this);
}
*/
/** @brief Compact plan destructor
CUDPPCompactPlan::~CUDPPCompactPlan()
{
delete m_scanPlan;
freeCompactStorage(this);
}
*/
/** @brief Sort Plan constructor
*
* @param[in] config The configuration struct specifying algorithm and options
* @param[in] numElements The maximum number of elements to be sorted
*/
/*CUDPPSortPlan::CUDPPSortPlan(CUDPPConfiguration config, size_t numElements)
: CUDPPPlan(config, numElements, 1, 0),
m_scanPlan(0),
m_d_temp(0),
m_d_tempAddress(0)
{
CUDPPConfiguration scanConfig =
{
CUDPP_SCAN,
CUDPP_ADD,
CUDPP_UINT,
CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE
};
//if (config.algorithm == CUDPP_SORT_RADIX_GLOBAL)
{
m_scanPlan = new CUDPPScanPlan(scanConfig, numElements, 1, 0);
}
allocSortStorage(this);
}*/
/** @brief Sort plan destructor */
/*CUDPPSortPlan::~CUDPPSortPlan()
{
delete m_scanPlan;
freeSortStorage(this);
}*/
CUDPPRadixSortPlan::CUDPPRadixSortPlan(CUDPPConfiguration config, size_t numElements)
: CUDPPPlan(config, numElements, 1, 0),
m_scanPlan(0),
m_tempKeys(0),
m_tempValues(0),
m_counters(0),
m_countersSum(0),
m_blockOffsets(0)
{
size_t numBlocks2 = ((numElements % (SORT_CTA_SIZE * 2)) == 0) ?
(numElements / (SORT_CTA_SIZE * 2)) : (numElements / (SORT_CTA_SIZE * 2) + 1);
CUDPPConfiguration scanConfig =
{
CUDPP_SCAN,
CUDPP_ADD,
CUDPP_UINT,
CUDPP_OPTION_FORWARD | CUDPP_OPTION_EXCLUSIVE
};
if(m_config.options == CUDPP_OPTION_KEYS_ONLY)
m_bKeysOnly = true;
else
m_bKeysOnly = false;
m_scanPlan = new CUDPPScanPlan(scanConfig, numBlocks2*16, 1, 0);
allocRadixSortStorage(this);
}
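// Sizing note (illustrative arithmetic, not part of the original source): each
// block processes SORT_CTA_SIZE * 2 = 512 keys and keeps 16 per-radix counters,
// so for, e.g., numElements = 1,000,000 the constructor uses
// numBlocks2 = ceil(1000000 / 512) = 1954 and builds a scan plan over
// 1954 * 16 = 31264 counters.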
CUDPPRadixSortPlan::~CUDPPRadixSortPlan()
{
delete m_scanPlan;
freeRadixSortStorage(this);
}
/** @brief SparseMatrixVectorMultiply Plan constructor
*
* @param[in] config The configuration struct specifying options
* @param[in] numNonZeroElements The number of non-zero elements in sparse matrix
* @param[in] A Array of non-zero matrix elements
* @param[in] rowIndex Array of indices of the first element of each row
* in the "flattened" version of the sparse matrix
* @param[in] index Array of indices of non-zero elements in the matrix
* @param[in] numRows The number of rows in the sparse matrix
CUDPPSparseMatrixVectorMultiplyPlan::CUDPPSparseMatrixVectorMultiplyPlan(
CUDPPConfiguration config,
size_t numNonZeroElements,
const void *A,
const unsigned int *rowIndex,
const unsigned int *index,
size_t numRows
)
: CUDPPPlan(config, numNonZeroElements, 1, 0),
m_segmentedScanPlan(0),
m_d_prod(0),
m_d_flags(0),
m_d_rowFinalIndex(0),
m_rowFinalIndex(0),
m_numRows(numRows),
m_numNonZeroElements(numNonZeroElements)
{
CUDPPConfiguration segScanConfig =
{
CUDPP_SEGMENTED_SCAN,
CUDPP_ADD,
config.datatype,
(CUDPP_OPTION_FORWARD | CUDPP_OPTION_INCLUSIVE)
};
m_segmentedScanPlan = new CUDPPSegmentedScanPlan(segScanConfig, m_numNonZeroElements);
// Generate an array of the indices of the last element of each row
// in the "flattened" version of the sparse matrix
m_rowFinalIndex = new unsigned int [m_numRows];
for (unsigned int i=0; i < m_numRows; ++i)
{
if (i < m_numRows-1)
m_rowFinalIndex[i] = rowIndex[i+1];
else
m_rowFinalIndex[i] = (unsigned int)numNonZeroElements;
}
allocSparseMatrixVectorMultiplyStorage(this, A, rowIndex, index);
}
*/
/** @brief Sparse matrix-vector plan destructor
CUDPPSparseMatrixVectorMultiplyPlan::~CUDPPSparseMatrixVectorMultiplyPlan()
{
freeSparseMatrixVectorMultiplyStorage(this);
delete m_segmentedScanPlan;
delete [] m_rowFinalIndex;
}
*/
/** @brief CUDPP Rand Plan Constructor
* @param[in] config The configuration struct specifying options
* @param[in] num_elements The number of elements to generate random bits for
CUDPPRandPlan::CUDPPRandPlan(CUDPPConfiguration config, size_t num_elements)
: CUDPPPlan(config, num_elements, 1, 0),
m_seed(0)
{
}
*/

View File

@@ -0,0 +1,158 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 3572$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#ifndef __CUDPP_PLAN_H__
#define __CUDPP_PLAN_H__
typedef void* KernelPointer;
extern "C" size_t getNumCTAs(KernelPointer kernel);
extern "C" void compNumCTAs(KernelPointer kernel, size_t bytesDynamicSharedMem, size_t threadsPerBlock);
template <typename T>
size_t numCTAs(T kernel)
{
return getNumCTAs((KernelPointer)kernel);
}
template <typename T>
void computeNumCTAs(T kernel, unsigned int bytesDynamicSharedMem, size_t threadsPerBlock)
{
compNumCTAs((KernelPointer)kernel, bytesDynamicSharedMem, threadsPerBlock);
}
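// Illustrative usage sketch (hypothetical kernel and byte-count names): the CTA
// count for a kernel is computed once and cached, then looked up at dispatch time:
//
//   computeNumCTAs(scanKernel, sharedMemBytes, CTA_SIZE);  // e.g. at plan creation
//   size_t nBlocks = numCTAs(scanKernel);                  // e.g. at kernel launch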
/** @brief Base class for CUDPP Plan data structures
*
* CUDPPPlan and its subclasses provide the internal (i.e. not visible to the
* library user) infrastructure for planning algorithm execution. They
* own intermediate storage for CUDPP algorithms as well as, in some cases,
* information about optimal execution configuration for the present hardware.
*
*/
class CUDPPPlan
{
public:
CUDPPPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch);
virtual ~CUDPPPlan() {}
// Note anything passed to functions compiled by NVCC must be public
CUDPPConfiguration m_config; //!< @internal Options structure
size_t m_numElements; //!< @internal Maximum number of input elements
size_t m_numRows; //!< @internal Maximum number of input rows
size_t m_rowPitch; //!< @internal Pitch of input rows in elements
};
/** @brief Plan class for scan algorithm
*
*/
class CUDPPScanPlan : public CUDPPPlan
{
public:
CUDPPScanPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch);
virtual ~CUDPPScanPlan();
void **m_blockSums; //!< @internal Intermediate block sums array
size_t *m_rowPitches; //!< @internal Pitch of each row in elements (for cudppMultiScan())
size_t m_numEltsAllocated; //!< @internal Number of elements allocated (maximum scan size)
size_t m_numRowsAllocated; //!< @internal Number of rows allocated (for cudppMultiScan())
size_t m_numLevelsAllocated; //!< @internal Number of levels allocated (in _scanBlockSums)
};
/** @brief Plan class for segmented scan algorithm
*
*/
class CUDPPSegmentedScanPlan : public CUDPPPlan
{
public:
CUDPPSegmentedScanPlan(CUDPPConfiguration config, size_t numElements);
virtual ~CUDPPSegmentedScanPlan();
void **m_blockSums; //!< @internal Intermediate block sums array
unsigned int **m_blockFlags; //!< @internal Intermediate block flags array
unsigned int **m_blockIndices; //!< @internal Intermediate block indices array
size_t m_numEltsAllocated; //!< @internal Number of elements allocated (maximum scan size)
size_t m_numLevelsAllocated; //!< @internal Number of levels allocated (in _scanBlockSums)
};
/** @brief Plan class for compact algorithm
*
*/
class CUDPPCompactPlan : public CUDPPPlan
{
public:
CUDPPCompactPlan(CUDPPConfiguration config, size_t numElements, size_t numRows, size_t rowPitch);
virtual ~CUDPPCompactPlan();
CUDPPScanPlan *m_scanPlan; //!< @internal Compact performs a scan of type unsigned int using this plan
unsigned int* m_d_outputIndices; //!< @internal Output address of compacted elements; this is the result of scan
};
class CUDPPRadixSortPlan : public CUDPPPlan
{
public:
CUDPPRadixSortPlan(CUDPPConfiguration config, size_t numElements);
virtual ~CUDPPRadixSortPlan();
bool m_bKeysOnly;
bool m_bManualCoalesce;
bool m_bUsePersistentCTAs;
unsigned int m_persistentCTAThreshold[2];
unsigned int m_persistentCTAThresholdFullBlocks[2];
CUDPPScanPlan *m_scanPlan; //!< @internal Sort performs a scan of type unsigned int using this plan
unsigned int m_keyBits;
mutable void *m_tempKeys; //!< @internal Intermediate storage for keys
mutable void *m_tempValues; //!< @internal Intermediate storage for values
unsigned int *m_counters; //!< @internal Counter for each radix
unsigned int *m_countersSum; //!< @internal Prefix sum of radix counters
unsigned int *m_blockOffsets; //!< @internal Global offsets of each radix in each block
};
/** @brief Plan class for sparse-matrix dense-vector multiply
*
*/
class CUDPPSparseMatrixVectorMultiplyPlan : public CUDPPPlan
{
public:
CUDPPSparseMatrixVectorMultiplyPlan(CUDPPConfiguration config, size_t numNZElts,
const void *A,
const unsigned int *rowindx,
const unsigned int *indx, size_t numRows);
virtual ~CUDPPSparseMatrixVectorMultiplyPlan();
CUDPPSegmentedScanPlan *m_segmentedScanPlan; //!< @internal Performs a segmented scan of type T using this plan
void *m_d_prod; //!< @internal Vector of products (of an element in A and its corresponding (that is,
//! belonging to the same row) element in x; this is the input and output of
//! segmented scan
unsigned int *m_d_flags; //!< @internal Vector of flags where a flag is set if an element of A is the first element
//! of its row; this is the flags vector for segmented scan
unsigned int *m_d_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
//! which is the last element of that row. Resides in GPU memory.
unsigned int *m_d_rowIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
//! which is the first element of that row. Resides in GPU memory.
unsigned int *m_d_index; //!< @internal Vector of column numbers, one for each element in A
void *m_d_A; //!< @internal The A matrix
unsigned int *m_rowFinalIndex; //!< @internal Vector of row end indices, which for each row specifies an index in A
//! which is the last element of that row. Resides in CPU memory.
size_t m_numRows; //!< Number of rows
size_t m_numNonZeroElements; //!<Number of non-zero elements
};
/** @brief Plan class for random number generator
*
*/
class CUDPPRandPlan : public CUDPPPlan
{
public:
CUDPPRandPlan(CUDPPConfiguration config, size_t num_elements);
unsigned int m_seed; //!< @internal the seed for the random number generator
};
#endif // __CUDPP_PLAN_H__

View File

@@ -0,0 +1,155 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 3572$
// $Date: 2007-11-19 13:58:06 +0000 (Mon, 19 Nov 2007) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp.h"
#include "cudpp_plan.h"
#include "cudpp_plan_manager.h"
#include "cudpp_maximal_launch.h"
typedef void* KernelPointer;
extern "C" size_t getNumCTAs(KernelPointer kernel)
{
return CUDPPPlanManager::numCTAs(kernel);
}
extern "C" void compNumCTAs(KernelPointer kernel, size_t bytesDynamicSharedMem, size_t threadsPerBlock)
{
CUDPPPlanManager::computeNumCTAs(kernel, bytesDynamicSharedMem, threadsPerBlock);
}
//! @internal Instantiate the plan manager singleton object
void CUDPPPlanManager::Instantiate()
{
if (NULL == m_instance)
m_instance = new CUDPPPlanManager;
}
//! @internal Destroy the plan manager singleton object
void CUDPPPlanManager::Destroy()
{
if (NULL != m_instance)
{
delete m_instance;
m_instance = NULL;
}
}
/** @brief Plan Manager destructor
* Destroys all plans as well as the plan manager.
*/
CUDPPPlanManager::~CUDPPPlanManager()
{
std::map<CUDPPHandle,CUDPPPlan*>::iterator it;
for (it = m_instance->plans.begin(); it != m_instance->plans.end(); it++)
{
CUDPPPlan* plan = it->second;
delete plan;
plan = NULL;
}
m_instance->plans.clear();
m_instance->numCTAsTable.clear();
}
/** @brief Add a plan to the plan manager
*
* @returns a valid CUDPPHandle if the plan was successfully added, or
* CUDPP_INVALID_HANDLE otherwise
* @param[in] plan The plan to add
*/
CUDPPHandle CUDPPPlanManager::AddPlan(CUDPPPlan* plan)
{
Instantiate();
std::pair<std::map<CUDPPHandle, CUDPPPlan*>::iterator, bool> ret;
CUDPPHandle handle = (CUDPPHandle)m_instance->plans.size();
ret = m_instance->plans.insert(std::pair<CUDPPHandle,CUDPPPlan*>(handle, plan));
if (ret.second == true)
return handle;
else
return CUDPP_INVALID_HANDLE;
}
/** @brief Remove a plan from the plan manager
*
* @returns true if the plan was successfully removed, false otherwise
* @param[in] handle The handle to the plan to remove
*/
bool CUDPPPlanManager::RemovePlan(CUDPPHandle handle)
{
if (m_instance == NULL)
{
return false;
}
std::map<CUDPPHandle,CUDPPPlan*>::iterator it;
it = m_instance->plans.find(handle);
if (it != m_instance->plans.end())
{
CUDPPPlan* plan = it->second;
delete plan;
plan = NULL;
m_instance->plans.erase(it);
if (0 == m_instance->plans.size())
{
Destroy();
}
return true;
}
else
{
return false;
}
}
/** @brief Get a plan from the plan manager by handle
*
* @returns A pointer to the plan if found, or NULL otherwise
* @param handle The handle to the requested plan
*/
CUDPPPlan* CUDPPPlanManager::GetPlan(CUDPPHandle handle)
{
if (m_instance == NULL)
{
return NULL;
}
std::map<CUDPPHandle, CUDPPPlan*>::iterator it;
it = m_instance->plans.find(handle);
if (it != m_instance->plans.end())
{
return it->second;
}
else
{
return NULL;
}
}
size_t CUDPPPlanManager::numCTAs(KernelPointer kernel)
{
if (m_instance == NULL)
{
return 0;
}
return m_instance->numCTAsTable[kernel];
}
void CUDPPPlanManager::computeNumCTAs(KernelPointer kernel, size_t bytesDynamicSharedMem, size_t threadsPerBlock)
{
Instantiate();
m_instance->numCTAsTable[kernel] = maxBlocks(kernel, bytesDynamicSharedMem, threadsPerBlock);
}

View File

@@ -0,0 +1,56 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 3572$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#ifndef __CUDPP_PLAN_MANAGER_H__
#define __CUDPP_PLAN_MANAGER_H__
#include <map>
class CUDPPPlan;
typedef void* KernelPointer;
/** @brief Singleton manager class for CUDPPPlan objects
*
* This class manages all active plans in CUDPP. It is a singleton class,
* meaning that only one instance can exist. It is created automatically the
* first time AddPlan() is called, and destroyed when the last plan is removed
* using RemovePlan().
*/
class CUDPPPlanManager
{
public:
static CUDPPHandle AddPlan(CUDPPPlan* plan);
static bool RemovePlan(CUDPPHandle handle);
static CUDPPPlan* GetPlan(CUDPPHandle handle);
static size_t numCTAs(KernelPointer kernel);
static void computeNumCTAs(KernelPointer kernel,
size_t bytesDynamicSharedMem,
size_t threadsPerBlock);
protected:
static CUDPPPlanManager* m_instance;
std::map<CUDPPHandle, CUDPPPlan*> plans;
std::map<void*, size_t> numCTAsTable;
private:
//! @internal Instantiate the plan manager singleton object
static void Instantiate();
//! @internal Destroy the plan manager singleton object
static void Destroy();
private:
CUDPPPlanManager() {}
CUDPPPlanManager(const CUDPPPlanManager&) {}
~CUDPPPlanManager();
};
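// Illustrative lifecycle sketch (not part of the original header): plans are
// registered with the singleton and referred to by opaque handles, e.g.
//
//   CUDPPHandle h = CUDPPPlanManager::AddPlan(somePlan); // instantiates the manager on first use
//   CUDPPPlan  *p = CUDPPPlanManager::GetPlan(h);        // look the plan up later by handle
//   CUDPPPlanManager::RemovePlan(h);                     // deletes the plan; the manager
//                                                        // destroys itself when empty
// (somePlan is a hypothetical CUDPPPlan* created elsewhere.)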
#endif // __CUDPP_PLAN_MANAGER_H__

View File

@@ -0,0 +1,34 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#ifndef __RADIXSORT_H__
#define __RADIXSORT_H__
#define SORT_CTA_SIZE 256 //This CTA_SIZE must equal 16 * number of radices
#include "cudpp_globals.h"
#include "cudpp.h"
#include "cudpp_plan.h"
extern "C"
void allocRadixSortStorage(CUDPPRadixSortPlan* plan);
extern "C"
void freeRadixSortStorage(CUDPPRadixSortPlan* plan);
extern "C"
void cudppRadixSortDispatch(void *keys,
void *values,
size_t numElements,
int keyBits,
const CUDPPRadixSortPlan *plan);
#endif // __RADIXSORT_H__

View File

@@ -0,0 +1,36 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* cudpp_scan.h
*
* @brief Scan functionality header file - contains CUDPP interface (not public)
*/
#ifndef _CUDPP_SCAN_H_
#define _CUDPP_SCAN_H_
class CUDPPScanPlan;
extern "C"
void allocScanStorage(CUDPPScanPlan *plan);
extern "C"
void freeScanStorage(CUDPPScanPlan *plan);
extern "C"
void cudppScanDispatch(void *d_out,
const void *d_in,
size_t numElements,
size_t numRows,
const CUDPPScanPlan *plan);
#endif // _CUDPP_SCAN_H_

View File

@@ -0,0 +1,363 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* cudpp_util.h
*
* @brief C++ utility functions and classes used internally to cuDPP
*/
#ifndef __CUDPP_UTIL_H__
#define __CUDPP_UTIL_H__
#ifdef WIN32
#include <windows.h>
#endif
#include <cuda.h>
#include <cudpp.h>
#include <limits.h>
#include <float.h>
#if (CUDA_VERSION >= 3000)
#define LAUNCH_BOUNDS(x) __launch_bounds__((x))
#define LAUNCH_BOUNDS_MINBLOCKS(x, y) __launch_bounds__((x),(y))
#else
#define LAUNCH_BOUNDS(x)
#define LAUNCH_BOUNDS_MINBLOCKS(x, y)
#endif
/** @brief Determine if \a n is a power of two.
* @param n Value to be checked to see if it is a power of two
* @returns True if \a n is a power of two, false otherwise
*/
inline bool
isPowerOfTwo(int n)
{
return ((n&(n-1))==0) ;
}
/** @brief Determine if an integer \a n is a multiple of an integer \a f.
* @param n Multiple
* @param f Factor
* @returns True if \a n is a multiple of \a f, false otherwise
*/
inline bool
isMultiple(int n, int f)
{
if (isPowerOfTwo(f))
return ((n&(f-1))==0);
else
return (n%f==0);
}
/** @brief Compute the smallest power of two greater than or equal to \a n.
* @param n Input value
* @returns The smallest power of two greater than or equal to \a n
*/
inline int
ceilPow2(int n)
{
double log2n = log2((double)n);
if (isPowerOfTwo(n))
return n;
else
return 1 << (int)ceil(log2n);
}
/** @brief Compute the largest power of two less than or equal to \a n.
* @param n Input value
* @returns The largest power of two less than or equal to \a n.
*/
inline int
floorPow2(int n)
{
#ifdef WIN32
// method 2
return 1 << (int)_logb((float)n);
#else
// method 3
int exp;
frexp((float)n, &exp);
return 1 << (exp - 1);
#endif
}
/** @brief Returns the maximum value for type \a T.
*
* Implemented using template specialization on \a T.
*/
template <class T>
__host__ __device__ inline T getMax() { return 0; }
/** @brief Returns the minimum value for type \a T.
*
* Implemented using template specialization on \a T.
*/
template <class T>
__host__ __device__ inline T getMin() { return 0; }
// type specializations for the above
// getMax
template <> __host__ __device__ inline int getMax() { return INT_MAX; }
template <> __host__ __device__ inline unsigned int getMax() { return INT_MAX; }
template <> __host__ __device__ inline float getMax() { return FLT_MAX; }
template <> __host__ __device__ inline char getMax() { return (char)INT_MAX; }
template <> __host__ __device__ inline unsigned char getMax() { return (unsigned char)INT_MAX; }
// getMin
template <> __host__ __device__ inline int getMin() { return INT_MIN; }
template <> __host__ __device__ inline unsigned int getMin() { return 0; }
template <> __host__ __device__ inline float getMin() { return -FLT_MAX; }
template <> __host__ __device__ inline char getMin() { return (char)INT_MIN; }
template <> __host__ __device__ inline unsigned char getMin() { return (unsigned char)0; }
/** @brief Returns the maximum of three values.
* @param a First value.
* @param b Second value.
* @param c Third value.
* @returns The maximum of \a a, \a b and \a c.
*/
template<class T>
inline T max3(T a, T b, T c)
{
return (a > b) ? ((a > c)? a : c) : ((b > c) ? b : c);
}
/** @brief Utility template struct for generating small vector types from scalar types
*
* Given a base scalar type (\c int, \c float, etc.) and a vector length (1 through 4) as
* template parameters, this struct defines a vector type (\c float3, \c int4, etc.) of the
* specified length and base type. For example:
* \code
* template <class T>
* __device__ void myKernel(T *data)
* {
* typename typeToVector<T,4>::Result myVec4; // create a vec4 of type T
* myVec4 = ((typename typeToVector<T,4>::Result*)data)[0]; // load first element of data as a vec4
* }
* \endcode
*
* This functionality is implemented using template specialization. Currently specializations
* for int, float, and unsigned int vectors of lengths 2-4 are defined. Note that this results
* in types being generated at compile time -- there is no runtime cost. typeToVector is used by
* the optimized scan \c __device__ functions in scan_cta.cu.
*/
template <typename T, int N>
struct typeToVector
{
typedef T Result;
};
template<>
struct typeToVector<int, 4>
{
typedef int4 Result;
};
template<>
struct typeToVector<unsigned int, 4>
{
typedef uint4 Result;
};
template<>
struct typeToVector<float, 4>
{
typedef float4 Result;
};
template<>
struct typeToVector<int, 3>
{
typedef int3 Result;
};
template<>
struct typeToVector<unsigned int, 3>
{
typedef uint3 Result;
};
template<>
struct typeToVector<float, 3>
{
typedef float3 Result;
};
template<>
struct typeToVector<int, 2>
{
typedef int2 Result;
};
template<>
struct typeToVector<unsigned int, 2>
{
typedef uint2 Result;
};
template<>
struct typeToVector<float, 2>
{
typedef float2 Result;
};
/** @brief Templatized operator class used by scan and segmented scan
*
* This Operator class is used to allow generic support of binary
* associative operators in scan. It defines two member functions,
* op() and identity(), that are used in place of + and 0 (for
* example) in the scan and segmented scan code. Because this is
* template code, all decisions in the code are made at compile
* time, resulting in optimal operator code. Currently the operators
* CUDPP_ADD, CUDPP_MULTIPLY, CUDPP_MIN, and CUDPP_MAX are supported.
* Operator is implemented using template specialization for the
* types \c int, \c unsigned int, and \c float.
*/
template <typename T, CUDPPOperator oper>
class Operator
{
public:
/** Applies the operator to operands \a a and \a b.
* @param a First operand
* @param b Second operand
* @returns a OP b, where OP is defined by ::CUDPPOperator \a oper.
*/
static __device__ T op(const T a, const T b)
{
switch (oper)
{
case CUDPP_ADD:
return a + b;
case CUDPP_MULTIPLY:
return a * b;
case CUDPP_MIN:
return min(a, b);
case CUDPP_MAX:
return max(a, b);
}
}
/** Returns the identity element defined for type \a T */
static __device__ T identity() { return 0; }
};
// specializations for different types
template <CUDPPOperator oper>
class Operator <int, oper>
{
public:
static __device__ int op(const int a, const int b)
{
switch (oper)
{
default:
case CUDPP_ADD:
return a + b;
case CUDPP_MULTIPLY:
return a * b;
case CUDPP_MIN:
return min(a, b);
case CUDPP_MAX:
return max(a, b);
}
}
static __device__ int identity()
{
switch (oper)
{
default:
case CUDPP_ADD:
return 0;
case CUDPP_MULTIPLY:
return 1;
case CUDPP_MIN:
return INT_MAX;
case CUDPP_MAX:
return INT_MIN;
}
}
};
template <CUDPPOperator oper>
class Operator <unsigned int, oper>
{
public:
static __device__ unsigned int op(const unsigned int a, const unsigned int b)
{
switch (oper)
{
default:
case CUDPP_ADD:
return a + b;
case CUDPP_MULTIPLY:
return a * b;
case CUDPP_MIN:
return min(a, b);
case CUDPP_MAX:
return max(a, b);
}
}
static __device__ unsigned int identity()
{
switch (oper)
{
default:
case CUDPP_ADD:
return 0;
case CUDPP_MULTIPLY:
return 1;
case CUDPP_MIN:
return UINT_MAX;
case CUDPP_MAX:
return 0;
}
}
};
template <CUDPPOperator oper>
class Operator <float, oper>
{
public:
static __device__ float op(const float a, const float b)
{
switch (oper)
{
default:
case CUDPP_ADD:
return a + b;
case CUDPP_MULTIPLY:
return a * b;
case CUDPP_MIN:
return min(a, b);
case CUDPP_MAX:
return max(a, b);
}
}
static __device__ float identity()
{
switch (oper)
{
default:
case CUDPP_ADD:
return 0.0f;
case CUDPP_MULTIPLY:
return 1.0f;
case CUDPP_MIN:
return FLT_MAX;
case CUDPP_MAX:
return -FLT_MAX;
}
}
};
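/* Illustrative sketch (not part of the original header; reducePair is a
 * hypothetical helper): because op() and identity() are resolved at compile
 * time, device code can be written once against the generic interface and
 * instantiated per type and operator:
 * \code
 * template <typename T, CUDPPOperator oper>
 * __device__ T reducePair(T a, T b)
 * {
 *     // e.g. Operator<float, CUDPP_MAX>::op(a, b) compiles to max(a, b),
 *     //      Operator<int, CUDPP_ADD>::identity() compiles to 0
 *     return Operator<T, oper>::op(a, b);
 * }
 * \endcode
 */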
#endif // __CUDPP_UTIL_H__
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:

879
lib/gpu/cudpp_mini/cutil.h Normal file
View File

@@ -0,0 +1,879 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
/* CUda UTility Library */
#ifndef _CUTIL_H_
#define _CUTIL_H_
#include <cuda_runtime.h>
#ifdef _WIN32
# pragma warning( disable : 4996 ) // disable deprecated warning
#endif
#ifdef __cplusplus
extern "C" {
#endif
// helper typedefs for building DLL
#ifdef _WIN32
# ifdef BUILD_DLL
# define DLL_MAPPING __declspec(dllexport)
# else
# define DLL_MAPPING __declspec(dllimport)
# endif
#else
# define DLL_MAPPING
#endif
#ifdef _WIN32
#define CUTIL_API __stdcall
#else
#define CUTIL_API
#endif
////////////////////////////////////////////////////////////////////////////
//! CUT bool type
////////////////////////////////////////////////////////////////////////////
enum CUTBoolean
{
CUTFalse = 0,
CUTTrue = 1
};
////////////////////////////////////////////////////////////////////////////
//! Deallocate memory allocated within Cutil
//! @param ptr pointer to memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
void CUTIL_API
cutFree( void* ptr);
////////////////////////////////////////////////////////////////////////////
//! Helper for bank conflict checking (should only be used with the
//! CUT_BANK_CHECKER macro)
//! @param tidx thread id in x dimension of block
//! @param tidy thread id in y dimension of block
//! @param tidz thread id in z dimension of block
//! @param bdimx block size in x dimension
//! @param bdimy block size in y dimension
//! @param bdimz block size in z dimension
//! @param file name of the source file where the access takes place
//! @param line line in the source file where the access takes place
//! @param aname name of the array which is accessed
//! @param index index into the array
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
void CUTIL_API
cutCheckBankAccess( unsigned int tidx, unsigned int tidy, unsigned int tidz,
unsigned int bdimx, unsigned int bdimy,
unsigned int bdimz, const char* file, const int line,
const char* aname, const int index);
////////////////////////////////////////////////////////////////////////////
//! Find the path for a filename within a hardcoded set of paths
//! @return the path if succeeded, otherwise 0
//! @param filename name of the file
//! @param executablePath optional absolute path of the executable
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
char* CUTIL_API
cutFindFilePath(const char* filename, const char* executablePath);
////////////////////////////////////////////////////////////////////////////
//! Find a file within a specified directory tree
//! @return CUTTrue if the file was found, otherwise CUTFalse
//! @param outputPath the path to the file, if found
//! @param startDir the root of the directory tree to search
//! @param dirName the name of the file to find
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutFindFile(char * outputPath, const char * startDir, const char * dirName);
////////////////////////////////////////////////////////////////////////////
//! Find a directory within a specified directory tree
//! @return CUTTrue if the directory was found, otherwise CUTFalse
//! @param outputPath the path to the directory, if found
//! @param startDir the root of the directory tree to search
//! @param dirName the name of the directory to find
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutFindDir(char * outputPath, const char * startDir, const char * dirName);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing single precision floating point data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFilef( const char* filename, float** data, unsigned int* len,
bool verbose = false);
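////////////////////////////////////////////////////////////////////////////
// Illustrative usage sketch (hypothetical file name "input.dat"):
//
//   float* data = NULL; unsigned int len = 0;
//   if (CUTTrue == cutReadFilef("input.dat", &data, &len))
//   {
//       /* ... use the len floats in data ... */
//       cutFree(data);   // the buffer was allocated inside cutil
//   }
////////////////////////////////////////////////////////////////////////////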
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing double precision floating point data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFiled( const char* filename, double** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing integer data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned integer data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFileui( const char* filename, unsigned int** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing char / byte data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFileb( const char* filename, char** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned char / byte data
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutReadFileub( const char* filename, unsigned char** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing single precision floating point
//! data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFilef( const char* filename, const float* data, unsigned int len,
const float epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing double precision floating point
//! data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFiled( const char* filename, const float* data, unsigned int len,
const double epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing integer data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFilei( const char* filename, const int* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned integer data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFileui( const char* filename,const unsigned int* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing char / byte data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFileb( const char* filename, const char* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned char / byte data
//! @return CUTTrue if writing the file succeeded, otherwise false
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutWriteFileub( const char* filename,const unsigned char* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned char as data element type)
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPGMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type)
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPPMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type), padding
//! 4th component
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPPM4ub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned int as data element type)
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPGMi( const char* file, unsigned int** data,
unsigned int* w, unsigned int* h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned short as data element type)
//! @return CUTTrue if reading the file succeeded, otherwise false
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPGMs( const char* file, unsigned short** data,
unsigned int* w, unsigned int* h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with float as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is
//! initialized within Cutil then cutFree() has to be used to
//! deallocate the memory
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutLoadPGMf( const char* file, float** data,
unsigned int* w, unsigned int* h);
////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned char as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePGMub( const char* file, unsigned char* data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePPMub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type, padded to
//! 4 bytes)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePPM4ub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned int as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePGMi( const char* file, unsigned int* data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned short as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePGMs( const char* file, unsigned short* data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with float as data element type)
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutSavePGMf( const char* file, float* data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are seperated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////
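////////////////////////////////////////////////////////////////////////////
//! Illustrative usage sketch for the command line helpers declared below
//! (not a definitive recipe; the argument names are hypothetical
//! placeholders):
//! \code
//! // invoked e.g. as:  ./app --samples=50 --scale=2.5 --quiet
//! int main(int argc, char** argv)
//! {
//!     int   samples = 10;     // defaults used when an argument is absent
//!     float scale   = 1.0f;
//!     cutGetCmdLineArgumenti(argc, (const char**) argv, "samples", &samples);
//!     cutGetCmdLineArgumentf(argc, (const char**) argv, "scale",   &scale);
//!     if (cutCheckCmdLineFlag(argc, (const char**) argv, "quiet") == CUTFalse)
//!         printf("samples=%d scale=%f\n", samples, scale);
//!     return 0;
//! }
//! \endcode
////////////////////////////////////////////////////////////////////////////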
////////////////////////////////////////////////////////////////////////////
//! Check if command line argument \a flag_name is given
//! @return CUTTrue if command line argument \a flag_name has been given,
//! otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param flag_name name of command line flag
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCheckCmdLineFlag( const int argc, const char** argv,
const char* flag_name);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type int
//! @return CUTTrue if command line argument \a arg_name has been given and
//! is of the requested type, otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutGetCmdLineArgumenti( const int argc, const char** argv,
const char* arg_name, int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type float
//! @return CUTTrue if command line argument \a arg_name has been given and
//! is of the requested type, otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutGetCmdLineArgumentf( const int argc, const char** argv,
const char* arg_name, float* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type string
//! @return CUTTrue if command line argument \a arg_name has been given and
//! is of the requested type, otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutGetCmdLineArgumentstr( const int argc, const char** argv,
const char* arg_name, char** val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument list whose elements are strings
//! @return CUTTrue if command line argument \a arg_name has been given and
//! is of the requested type, otherwise CUTFalse
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val command line argument list
//! @param len length of the list / number of elements
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutGetCmdLineArgumentListstr( const int argc, const char** argv,
const char* arg_name, char** val,
unsigned int* len);
////////////////////////////////////////////////////////////////////////////
//! Extended assert
//! @return CUTTrue if the condition \a val holds, otherwise CUTFalse
//! @param val condition to test
//! @param file __FILE__ macro
//! @param line __LINE__ macro
//! @note This function should be used via the CUT_CONDITION(val) macro
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCheckCondition( int val, const char* file, const int line);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutComparef( const float* reference, const float* data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutComparei( const int* reference, const int* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCompareub( const unsigned char* reference, const unsigned char* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays with an epsilon tolerance for equality
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCompareube( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const int epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutComparefe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! equality
//! @return CUTTrue if \a reference and \a data are identical,
//! otherwise CUTFalse
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCompareL2fe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
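////////////////////////////////////////////////////////////////////////////
//! Illustrative usage sketch for the comparison routines above (not a
//! definitive recipe; the array names, their length \a len, and the 1e-6f
//! tolerance are hypothetical placeholders):
//! \code
//! // h_reference: gold result computed on the CPU
//! // h_result:    result copied back from the GPU, len elements each
//! CUTBoolean ok = cutCompareL2fe(h_reference, h_result, len, 1e-6f);
//! printf("Test %s\n", (ok == CUTTrue) ? "PASSED" : "FAILED");
//! \endcode
////////////////////////////////////////////////////////////////////////////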
////////////////////////////////////////////////////////////////////////////
//! Timer functionality
////////////////////////////////////////////////////////////////////////////
//! Create a new timer
//! @return CUTTrue if a timer has been created, otherwise CUTFalse
//! @param name name of the new timer, set to 0 if the creation failed
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutCreateTimer( unsigned int* name);
////////////////////////////////////////////////////////////////////////////
//! Delete a timer
//! @return CUTTrue if the timer has been deleted, otherwise CUTFalse
//! @param name name of the timer to delete
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutDeleteTimer( unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Start the timer with name \a name
//! @param name name of the timer to start
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutStartTimer( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Stop the timer with name \a name. Does not reset.
//! @param name name of the timer to stop
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutStopTimer( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Resets the timer's counter.
//! @param name name of the timer to reset.
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
CUTBoolean CUTIL_API
cutResetTimer( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Returns total execution time in milliseconds for the timer over all
//! runs since the last reset or timer creation.
//! @param name name of the timer to return the time of
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
float CUTIL_API
cutGetTimerValue( const unsigned int name);
////////////////////////////////////////////////////////////////////////////
//! Return the average time in milliseconds for timer execution as the
//! total time for the timer divided by the number of completed (stopped)
//! runs the timer has made.
//! Excludes the current running time if the timer is currently running.
//! @param name name of the timer to return the time of
////////////////////////////////////////////////////////////////////////////
DLL_MAPPING
float CUTIL_API
cutGetAverageTimerValue( const unsigned int name);
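////////////////////////////////////////////////////////////////////////////
//! Illustrative usage sketch for the timer API above (not a definitive
//! recipe; "myKernel", its launch configuration, and "d_data" are
//! hypothetical placeholders):
//! \code
//! unsigned int timer = 0;
//! cutCreateTimer(&timer);
//! cutStartTimer(timer);
//! myKernel<<<grid, block>>>(d_data);
//! cudaThreadSynchronize();             // include the GPU work in the timing
//! cutStopTimer(timer);
//! printf("elapsed: %f ms\n", cutGetTimerValue(timer));
//! cutDeleteTimer(timer);
//! \endcode
////////////////////////////////////////////////////////////////////////////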
////////////////////////////////////////////////////////////////////////////
//! Macros
#ifdef _DEBUG
#if __DEVICE_EMULATION__
// Interface for bank conflict checker
#define CUT_BANK_CHECKER( array, index) \
(cutCheckBankAccess( threadIdx.x, threadIdx.y, threadIdx.z, blockDim.x, \
blockDim.y, blockDim.z, \
__FILE__, __LINE__, #array, index ), \
array[index])
#else
#define CUT_BANK_CHECKER( array, index) array[index]
#endif
# define CU_SAFE_CALL_NO_SYNC( call ) do { \
CUresult err = call; \
if( CUDA_SUCCESS != err) { \
fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
exit(EXIT_FAILURE); \
} } while (0)
# define CU_SAFE_CALL( call ) do { \
CU_SAFE_CALL_NO_SYNC(call); \
CUresult err = cuCtxSynchronize(); \
if( CUDA_SUCCESS != err) { \
fprintf(stderr, "Cuda driver error %x in file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
exit(EXIT_FAILURE); \
} } while (0)
# define CUDA_SAFE_CALL_NO_SYNC( call) do { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
# define CUDA_SAFE_CALL( call) do { \
CUDA_SAFE_CALL_NO_SYNC(call); \
cudaError err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
# define CUFFT_SAFE_CALL( call) do { \
cufftResult err = call; \
if( CUFFT_SUCCESS != err) { \
fprintf(stderr, "CUFFT error in file '%s' in line %i.\n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
} } while (0)
# define CUT_SAFE_CALL( call) \
if( CUTTrue != call) { \
fprintf(stderr, "Cut error in file '%s' in line %i.\n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
}
//! Check for CUDA error
# define CUT_CHECK_ERROR(errorMessage) do { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} } while (0)
//! Check for malloc error
# define CUT_SAFE_MALLOC( mallocCall ) do{ \
if( !(mallocCall)) { \
fprintf(stderr, "Host malloc failure in file '%s' in line %i\n", \
__FILE__, __LINE__); \
exit(EXIT_FAILURE); \
    } } while(0)
//! Check if condition is true (flexible assert)
# define CUT_CONDITION( val) \
if( CUTFalse == cutCheckCondition( val, __FILE__, __LINE__)) { \
exit(EXIT_FAILURE); \
}
#else // not DEBUG
#define CUT_BANK_CHECKER( array, index) array[index]
// void macros for performance reasons
# define CUT_CHECK_ERROR(errorMessage)
# define CUT_CHECK_ERROR_GL()
# define CUT_CONDITION( val)
# define CU_SAFE_CALL_NO_SYNC( call) call
# define CU_SAFE_CALL( call) call
# define CUDA_SAFE_CALL_NO_SYNC( call) call
# define CUDA_SAFE_CALL( call) call
# define CUT_SAFE_CALL( call) call
# define CUFFT_SAFE_CALL( call) call
# define CUT_SAFE_MALLOC( mallocCall ) mallocCall
#endif
#if __DEVICE_EMULATION__
# define CUT_DEVICE_INIT(ARGC, ARGV)
#else
# define CUT_DEVICE_INIT(ARGC, ARGV) { \
int deviceCount; \
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount)); \
if (deviceCount == 0) { \
fprintf(stderr, "cutil error: no devices supporting CUDA.\n"); \
exit(EXIT_FAILURE); \
} \
int dev = 0; \
cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev); \
if (dev > deviceCount-1) dev = deviceCount - 1; \
cudaDeviceProp deviceProp; \
CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&deviceProp, dev)); \
if (deviceProp.major < 1) { \
fprintf(stderr, "cutil error: device does not support CUDA.\n"); \
exit(EXIT_FAILURE); \
} \
if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) \
fprintf(stderr, "Using device %d: %s\n", dev, deviceProp.name); \
CUDA_SAFE_CALL(cudaSetDevice(dev)); \
}
#endif
# define CUT_DEVICE_INIT_DRV(cuDevice, ARGC, ARGV) { \
cuDevice = 0; \
int deviceCount = 0; \
CUresult err = cuInit(0); \
if (CUDA_SUCCESS == err) \
CU_SAFE_CALL_NO_SYNC(cuDeviceGetCount(&deviceCount)); \
if (deviceCount == 0) { \
fprintf(stderr, "cutil error: no devices supporting CUDA\n"); \
exit(EXIT_FAILURE); \
} \
int dev = 0; \
cutGetCmdLineArgumenti(ARGC, (const char **) ARGV, "device", &dev); \
if (dev > deviceCount-1) dev = deviceCount - 1; \
CU_SAFE_CALL_NO_SYNC(cuDeviceGet(&cuDevice, dev)); \
char name[100]; \
cuDeviceGetName(name, 100, cuDevice); \
if (cutCheckCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == CUTFalse) \
fprintf(stderr, "Using device %d: %s\n", dev, name); \
}
#define CUT_EXIT(argc, argv) \
if (!cutCheckCmdLineFlag(argc, (const char**)argv, "noprompt")) { \
printf("\nPress ENTER to exit...\n"); \
fflush( stdout); \
fflush( stderr); \
getchar(); \
} \
exit(EXIT_SUCCESS);
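////////////////////////////////////////////////////////////////////////////
//! Illustrative usage sketch for the device init / exit macros above (not a
//! definitive recipe; "runTest" is a hypothetical placeholder for the
//! application's own work):
//! \code
//! int main(int argc, char** argv)
//! {
//!     CUT_DEVICE_INIT(argc, argv);   // honours --device=N and --quiet
//!     runTest(argc, argv);           // application work goes here
//!     CUT_EXIT(argc, argv);          // waits for ENTER unless --noprompt is given
//! }
//! \endcode
////////////////////////////////////////////////////////////////////////////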
#ifdef __cplusplus
}
#endif // #ifdef __cplusplus
#endif // #ifndef _CUTIL_H_

View File

@@ -0,0 +1,868 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
#include "cudpp_radixsort.h"
#include <cudpp_globals.h>
#include "sharedmem.h"
#include "cta/radixsort_cta.cu"
#ifdef __DEVICE_EMULATION__
#define __EMUSYNC __syncthreads()
#else
#define __EMUSYNC
#endif
/**
* @file
* radixsort_app.cu
*
* @brief CUDPP kernel-level radix sorting routines
*/
/** \addtogroup cudpp_kernel
* @{
*/
/** @name RadixSort Functions
* @{
*/
typedef unsigned int uint;
/** @brief An empty kernel used to reset CTA issue hardware
**/
__global__ void emptyKernel() {}
/** @brief Does special binary arithmetic before sorting floats
*
* Uses floatFlip function to flip bits.
* @param[in,out] values Values to be manipulated
* @param[in] numValues Number of values to be flipped
**/
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
flipFloats(uint *values, uint numValues)
{
uint index = __umul24(blockDim.x*4, blockIdx.x) + threadIdx.x;
if (index < numValues) values[index] = floatFlip<true>(values[index]);
index += blockDim.x;
if (index < numValues) values[index] = floatFlip<true>(values[index]);
index += blockDim.x;
if (index < numValues) values[index] = floatFlip<true>(values[index]);
index += blockDim.x;
if (index < numValues) values[index] = floatFlip<true>(values[index]);
}
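/* For reference, a host-side sketch of the order-preserving bit transform that
 * a floatFlip-style helper applies (the device-side floatFlip itself comes from
 * the includes above; the name floatFlipRef below is a hypothetical
 * placeholder): negative floats have every bit flipped, non-negative floats
 * have only the sign bit flipped, so the resulting unsigned integers sort in
 * the same order as the original IEEE-754 floats.
 *
 *   unsigned int floatFlipRef(unsigned int f)
 *   {
 *       unsigned int mask = (f & 0x80000000u) ? 0xffffffffu : 0x80000000u;
 *       return f ^ mask;
 *   }
 */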
/** @brief Undoes the flips from flipFloats
*
* Uses floatUnflip function to unflip bits.
* @param[in,out] values Values to be manipulated
* @param[in] numValues Number of values to be unflipped
**/
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
unflipFloats(uint *values, uint numValues)
{
uint index = __umul24(blockDim.x*4, blockIdx.x) + threadIdx.x;
if (index < numValues) values[index] = floatUnflip<true>(values[index]);
index += blockDim.x;
if (index < numValues) values[index] = floatUnflip<true>(values[index]);
index += blockDim.x;
if (index < numValues) values[index] = floatUnflip<true>(values[index]);
index += blockDim.x;
if (index < numValues) values[index] = floatUnflip<true>(values[index]);
}
/** @brief Optimization for sorts of WARP_SIZE or fewer elements
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] numElements Number of elements in the sort.
*/
template <bool flip>
__global__
LAUNCH_BOUNDS(WARP_SIZE)
void radixSortSingleWarp(uint *keys,
uint *values,
uint numElements)
{
volatile __shared__ uint sKeys[WARP_SIZE]; //remove class distinctions
volatile __shared__ uint sValues[WARP_SIZE];
volatile __shared__ uint sFlags[WARP_SIZE];
sKeys[threadIdx.x] = floatFlip<flip>(keys[threadIdx.x]);
sValues[threadIdx.x] = values[threadIdx.x];
__EMUSYNC; // emulation only
for(uint i = 1; i < numElements; i++)
{
uint key_i = sKeys[i];
uint val_i = sValues[i];
sFlags[threadIdx.x] = 0;
uint temp, tempval;
if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) )
{
temp = sKeys[threadIdx.x];
tempval = sValues[threadIdx.x];
sFlags[threadIdx.x] = 1;
#ifdef __DEVICE_EMULATION__
}
__EMUSYNC;
if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) )
{
#endif
sKeys[threadIdx.x + 1] = temp;
sValues[threadIdx.x + 1] = tempval;
sFlags[threadIdx.x + 1] = 0;
}
if(sFlags[threadIdx.x] == 1 )
{
sKeys[threadIdx.x] = key_i;
sValues[threadIdx.x] = val_i;
}
__EMUSYNC; // emulation only
}
keys[threadIdx.x] = floatUnflip<flip>(sKeys[threadIdx.x]);
values[threadIdx.x] = sValues[threadIdx.x];
}
/** @brief Optimization for sorts of WARP_SIZE or fewer elements. Keys-Only version.
*
* @param[in,out] keys Keys to be sorted
* @param[in] numElements Total number of elements to be sorted
**/
template <bool flip>
__global__
LAUNCH_BOUNDS(WARP_SIZE)
void radixSortSingleWarpKeysOnly(uint *keys,
uint numElements)
{
volatile __shared__ uint sKeys[WARP_SIZE];
volatile __shared__ uint sFlags[WARP_SIZE];
sKeys[threadIdx.x] = floatFlip<flip>(keys[threadIdx.x]);
__EMUSYNC; // emulation only
for(uint i = 1; i < numElements; i++)
{
uint key_i = sKeys[i];
sFlags[threadIdx.x] = 0;
uint temp;
if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) )
{
temp = sKeys[threadIdx.x];
sFlags[threadIdx.x] = 1;
#ifdef __DEVICE_EMULATION__
}
__EMUSYNC;
if( (threadIdx.x < i) && (sKeys[threadIdx.x] > key_i) )
{
#endif
sKeys[threadIdx.x + 1] = temp;
sFlags[threadIdx.x + 1] = 0;
}
if(sFlags[threadIdx.x] == 1 )
{
sKeys[threadIdx.x] = key_i;
}
__EMUSYNC; // emulation only
}
keys[threadIdx.x] = floatUnflip<flip>(sKeys[threadIdx.x]);
}
/** @brief Sorts all blocks of data independently in shared memory.
* Each thread block (CTA) sorts one block of 4*CTA_SIZE elements
*
* The radix sort is done in two stages. This stage calls radixSortBlock on each
* block independently, sorting on the basis of bits (startbit) -> (startbit + nbits)
*
* Template parameters are used to generate efficient code for various special cases
* For example, we have to handle arrays that are a multiple of the block size (fullBlocks)
* differently than arrays that are not. "flip" is used to only compile in the
* float flip code when float keys are used. "loop" is used when persistent CTAs
* are used.
*
* By persistent CTAs we mean that we launch only as many thread blocks as can
* be resident in the GPU and no more, rather than launching as many threads as
* we have elements. Persistent CTAs loop over blocks of elements until all work
* is complete. This can be faster in some cases. In our tests it is faster
* for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
*
* @param[out] keysOut Output of sorted keys
* @param[out] valuesOut Output of associated values
* @param[in] keysIn Input of unsorted keys in GPU
* @param[in] valuesIn Input of associated input values
* @param[in] numElements Total number of elements to sort
* @param[in] totalBlocks The number of blocks of data to sort
*/
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
radixSortBlocks(uint4* keysOut, uint4* valuesOut,
uint4* keysIn, uint4* valuesIn,
uint numElements, uint totalBlocks)
{
extern __shared__ uint4 sMem[];
uint4 key, value;
uint blockId = blockIdx.x;
while (!loop || blockId < totalBlocks)
{
uint i = blockId * blockDim.x + threadIdx.x;
uint idx = i << 2;
// handle non-full last block if array is not multiple of 1024 numElements
if (!fullBlocks && idx+3 >= numElements)
{
if (idx >= numElements)
{
key = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
value = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
}
else
{
// for non-full block, we handle uint1 values instead of uint4
uint *keys1 = (uint*)keysIn;
uint *values1 = (uint*)valuesIn;
key.x = (idx < numElements) ? floatFlip<flip>(keys1[idx]) : UINT_MAX;
key.y = (idx+1 < numElements) ? floatFlip<flip>(keys1[idx+1]) : UINT_MAX;
key.z = (idx+2 < numElements) ? floatFlip<flip>(keys1[idx+2]) : UINT_MAX;
key.w = UINT_MAX;
value.x = (idx < numElements) ? values1[idx] : UINT_MAX;
value.y = (idx+1 < numElements) ? values1[idx+1] : UINT_MAX;
value.z = (idx+2 < numElements) ? values1[idx+2] : UINT_MAX;
value.w = UINT_MAX;
}
}
else
{
key = keysIn[i];
value = valuesIn[i];
if (flip)
{
key.x = floatFlip<flip>(key.x);
key.y = floatFlip<flip>(key.y);
key.z = floatFlip<flip>(key.z);
key.w = floatFlip<flip>(key.w);
}
}
__syncthreads();
radixSortBlock<nbits, startbit>(key, value);
// handle non-full last block if array is not multiple of 1024 numElements
if(!fullBlocks && idx+3 >= numElements)
{
if (idx < numElements)
{
// for non-full block, we handle uint1 values instead of uint4
uint *keys1 = (uint*)keysOut;
uint *values1 = (uint*)valuesOut;
keys1[idx] = key.x;
values1[idx] = value.x;
if (idx + 1 < numElements)
{
keys1[idx + 1] = key.y;
values1[idx + 1] = value.y;
if (idx + 2 < numElements)
{
keys1[idx + 2] = key.z;
values1[idx + 2] = value.z;
}
}
}
}
else
{
keysOut[i] = key;
valuesOut[i] = value;
}
if (loop)
blockId += gridDim.x;
else
break;
}
}
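/* A sketch of how one 4-bit sorting pass could tie this kernel together with
 * findRadixOffsets(), a scan of the counters, and reorderData(). The real
 * host-side driver lives elsewhere in CUDPP; the buffer names, launch
 * configurations, and shared-memory sizes below are hypothetical placeholders:
 *
 *   radixSortBlocks<4, 0, true, false, false>
 *       <<<numBlocks, SORT_CTA_SIZE, radixSortSmemBytes>>>
 *       ((uint4*)tempKeys, (uint4*)tempValues,
 *        (uint4*)d_keys, (uint4*)d_values, numElements, numBlocks);
 *   findRadixOffsets<0, true, false>
 *       <<<numBlocks, SORT_CTA_SIZE, offsetsSmemBytes>>>
 *       ((uint2*)tempKeys, counters, blockOffsets, numElements, numBlocks);
 *   // exclusive scan of 'counters' -> 'countersSum' (done with the CUDPP scan)
 *   reorderData<0, true, true, false, false>
 *       <<<numBlocks, SORT_CTA_SIZE>>>
 *       (d_keys, d_values, (uint2*)tempKeys, (uint2*)tempValues,
 *        blockOffsets, countersSum, counters, numElements, numBlocks);
 *   // ...then repeat for startbit = 4, 8, ..., 28
 */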
/** @brief Computes the number of keys of each radix in each block and stores the offsets.
*
* Given an array with blocks sorted according to a 4-bit radix group, each
* block counts the number of keys that fall into each radix in the group, and
* finds the starting offset of each radix in the block. It then writes the radix
* counts to the counters array, and the starting offsets to the blockOffsets array.
*
* Template parameters are used to generate efficient code for various special cases
* For example, we have to handle arrays that are a multiple of the block size
* (fullBlocks) differently than arrays that are not. "loop" is used when persistent
* CTAs are used.
*
* By persistent CTAs we mean that we launch only as many thread blocks as can
* be resident in the GPU and no more, rather than launching as many threads as
* we have elements. Persistent CTAs loop over blocks of elements until all work
* is complete. This can be faster in some cases. In our tests it is faster
* for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
*
* @param[in] keys Input keys
* @param[out] counters Radix count for each block
* @param[out] blockOffsets The offset address for each block
* @param[in] numElements Total number of elements
* @param[in] totalBlocks Total number of blocks
**/
template<uint startbit, bool fullBlocks, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
findRadixOffsets(uint2 *keys,
uint *counters,
uint *blockOffsets,
uint numElements,
uint totalBlocks)
{
extern __shared__ uint sRadix1[];
__shared__ uint sStartPointers[16];
uint blockId = blockIdx.x;
while (!loop || blockId < totalBlocks)
{
uint2 radix2;
uint i = blockId * blockDim.x + threadIdx.x;
// handle non-full last block if array is not multiple of 1024 numElements
if(!fullBlocks && ((i + 1) << 1 ) > numElements )
{
// handle uint1 rather than uint2 for non-full blocks
uint *keys1 = (uint*)keys;
uint j = i << 1;
radix2.x = (j < numElements) ? keys1[j] : UINT_MAX;
j++;
radix2.y = (j < numElements) ? keys1[j] : UINT_MAX;
}
else
{
radix2 = keys[i];
}
sRadix1[2 * threadIdx.x] = (radix2.x >> startbit) & 0xF;
sRadix1[2 * threadIdx.x + 1] = (radix2.y >> startbit) & 0xF;
// Finds the position where the sRadix1 entries differ and stores start
// index for each radix.
if(threadIdx.x < 16)
{
sStartPointers[threadIdx.x] = 0;
}
__syncthreads();
if((threadIdx.x > 0) && (sRadix1[threadIdx.x] != sRadix1[threadIdx.x - 1]) )
{
sStartPointers[sRadix1[threadIdx.x]] = threadIdx.x;
}
if(sRadix1[threadIdx.x + SORT_CTA_SIZE] != sRadix1[threadIdx.x + SORT_CTA_SIZE - 1])
{
sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE]] = threadIdx.x + SORT_CTA_SIZE;
}
__syncthreads();
if(threadIdx.x < 16)
{
blockOffsets[blockId*16 + threadIdx.x] = sStartPointers[threadIdx.x];
}
__syncthreads();
// Compute the sizes of each block.
if((threadIdx.x > 0) && (sRadix1[threadIdx.x] != sRadix1[threadIdx.x - 1]) )
{
sStartPointers[sRadix1[threadIdx.x - 1]] =
threadIdx.x - sStartPointers[sRadix1[threadIdx.x - 1]];
}
if(sRadix1[threadIdx.x + SORT_CTA_SIZE] != sRadix1[threadIdx.x + SORT_CTA_SIZE - 1] )
{
sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]] =
threadIdx.x + SORT_CTA_SIZE - sStartPointers[sRadix1[threadIdx.x + SORT_CTA_SIZE - 1]];
}
if(threadIdx.x == SORT_CTA_SIZE - 1)
{
sStartPointers[sRadix1[2 * SORT_CTA_SIZE - 1]] =
2 * SORT_CTA_SIZE - sStartPointers[sRadix1[2 * SORT_CTA_SIZE - 1]];
}
__syncthreads();
if(threadIdx.x < 16)
{
counters[threadIdx.x * totalBlocks + blockId] =
sStartPointers[threadIdx.x];
}
if (loop)
blockId += gridDim.x;
else
break;
}
}
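/* A small worked example of what this kernel produces. Suppose one block of
 * 2*SORT_CTA_SIZE keys has already been block-sorted and its 4-bit digits
 * ((key >> startbit) & 0xF) read
 *
 *     0 0 1 1 1 3 3 ... 3
 *
 * i.e. two 0s starting at position 0, three 1s starting at position 2, and
 * the remaining keys are 3s starting at position 5. Then for this blockId:
 *
 *     blockOffsets[blockId*16 + 0] = 0,  blockOffsets[blockId*16 + 1] = 2,
 *     blockOffsets[blockId*16 + 3] = 5
 *
 * and counters[digit*totalBlocks + blockId] holds the per-digit counts
 * (2, 3, 2*SORT_CTA_SIZE - 5). Because counters is laid out digit-major, an
 * exclusive scan over it yields the global output offset of each
 * (digit, block) pair, which reorderData() below consumes.
 */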
/** @brief Reorders data in the global array.
*
* reorderData shuffles data in the array globally after the radix
* offsets have been found. On compute version 1.1 and earlier GPUs, this code depends
* on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
*
* On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
* that all writes are coalesced using extra work in the kernel. On later
* GPUs coalescing rules have been relaxed, so this extra overhead hurts
* performance. On these GPUs we set manualCoalesce=false and directly store
* the results.
*
* Template parameters are used to generate efficient code for various special cases
* For example, we have to handle arrays that are a multiple of the block size
* (fullBlocks) differently than arrays that are not. "loop" is used when persistent
* CTAs are used.
*
* By persistent CTAs we mean that we launch only as many thread blocks as can
* be resident in the GPU and no more, rather than launching as many threads as
* we have elements. Persistent CTAs loop over blocks of elements until all work
* is complete. This can be faster in some cases. In our tests it is faster
* for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
*
* @param[out] outKeys Output of sorted keys
* @param[out] outValues Output of associated values
* @param[in] keys Input of unsorted keys in GPU
* @param[in] values Input of associated input values
* @param[in] blockOffsets The offset address for each block
* @param[in] offsets Address of each radix within each block
* @param[in] sizes Number of elements in a block
* @param[in] numElements Total number of elements
* @param[in] totalBlocks Total number of data blocks to process
*
* @todo Args that are const below should be prototyped as const
**/
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
reorderData(uint *outKeys,
uint *outValues,
uint2 *keys,
uint2 *values,
uint *blockOffsets,
uint *offsets,
uint *sizes,
uint numElements,
uint totalBlocks)
{
__shared__ uint2 sKeys2[SORT_CTA_SIZE];
__shared__ uint2 sValues2[SORT_CTA_SIZE];
__shared__ uint sOffsets[16];
__shared__ uint sBlockOffsets[16];
uint *sKeys1 = (uint*)sKeys2;
uint *sValues1 = (uint*)sValues2;
uint blockId = blockIdx.x;
while (!loop || blockId < totalBlocks)
{
uint i = blockId * blockDim.x + threadIdx.x;
// handle non-full last block if array is not multiple of 1024 numElements
if(!fullBlocks && (((i + 1) << 1) > numElements))
{
uint *keys1 = (uint*)keys;
uint *values1 = (uint*)values;
uint j = i << 1;
sKeys1[threadIdx.x << 1] = (j < numElements) ? keys1[j] : UINT_MAX;
sValues1[threadIdx.x << 1] = (j < numElements) ? values1[j] : UINT_MAX;
j++;
sKeys1[(threadIdx.x << 1) + 1] = (j < numElements) ? keys1[j] : UINT_MAX;
sValues1[(threadIdx.x << 1) + 1] = (j < numElements) ? values1[j] : UINT_MAX;
}
else
{
sKeys2[threadIdx.x] = keys[i];
sValues2[threadIdx.x] = values[i];
}
if (!manualCoalesce)
{
if(threadIdx.x < 16)
{
sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId];
sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
}
__syncthreads();
uint radix = (sKeys1[threadIdx.x] >> startbit) & 0xF;
uint globalOffset = sOffsets[radix] + threadIdx.x - sBlockOffsets[radix];
if (fullBlocks || globalOffset < numElements)
{
outKeys[globalOffset] = floatUnflip<unflip>(sKeys1[threadIdx.x]);
outValues[globalOffset] = sValues1[threadIdx.x];
}
radix = (sKeys1[threadIdx.x + SORT_CTA_SIZE] >> startbit) & 0xF;
globalOffset = sOffsets[radix] + threadIdx.x + SORT_CTA_SIZE - sBlockOffsets[radix];
if (fullBlocks || globalOffset < numElements)
{
outKeys[globalOffset] = floatUnflip<unflip>(sKeys1[threadIdx.x + SORT_CTA_SIZE]);
outValues[globalOffset] = sValues1[threadIdx.x + SORT_CTA_SIZE];
}
}
else
{
__shared__ uint sSizes[16];
if(threadIdx.x < 16)
{
sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId];
sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
sSizes[threadIdx.x] = sizes[threadIdx.x * totalBlocks + blockId];
}
__syncthreads();
// 1 half-warp is responsible for writing out all values for 1 radix.
// Loops if there are more than 16 values to be written out.
// All start indices are rounded down to the nearest multiple of 16, and
// all end indices are rounded up to the nearest multiple of 16.
// Thus it can do extra work if the start and end indices are not multiples of 16
// This is bounded by a factor of 2 (it can do 2X more work at most).
const uint halfWarpID = threadIdx.x >> 4;
const uint halfWarpOffset = threadIdx.x & 0xF;
const uint leadingInvalid = sOffsets[halfWarpID] & 0xF;
uint startPos = sOffsets[halfWarpID] & 0xFFFFFFF0;
uint endPos = (sOffsets[halfWarpID] + sSizes[halfWarpID]) + 15 -
((sOffsets[halfWarpID] + sSizes[halfWarpID] - 1) & 0xF);
uint numIterations = endPos - startPos;
uint outOffset = startPos + halfWarpOffset;
uint inOffset = sBlockOffsets[halfWarpID] - leadingInvalid + halfWarpOffset;
for(uint j = 0; j < numIterations; j += 16, outOffset += 16, inOffset += 16)
{
if( (outOffset >= sOffsets[halfWarpID]) &&
(inOffset - sBlockOffsets[halfWarpID] < sSizes[halfWarpID]))
{
if(blockId < totalBlocks - 1 || outOffset < numElements)
{
outKeys[outOffset] = floatUnflip<unflip>(sKeys1[inOffset]);
outValues[outOffset] = sValues1[inOffset];
}
}
}
}
if (loop)
{
blockId += gridDim.x;
__syncthreads();
}
else
break;
}
}
/** @brief Sorts all blocks of data independently in shared memory.
* Each thread block (CTA) sorts one block of 4*CTA_SIZE elements
*
* The radix sort is done in two stages. This stage calls radixSortBlock on each
* block independently, sorting on the basis of bits (startbit) -> (startbit + nbits)
*
* Template parameters are used to generate efficient code for various special cases
* For example, we have to handle arrays that are a multiple of the block size (fullBlocks)
* differently than arrays that are not. "flip" is used to only compile in the
* float flip code when float keys are used. "loop" is used when persistent CTAs
* are used.
*
* By persistent CTAs we mean that we launch only as many thread blocks as can
* be resident in the GPU and no more, rather than launching as many threads as
* we have elements. Persistent CTAs loop over blocks of elements until all work
* is complete. This can be faster in some cases. In our tests it is faster
* for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
*
* @param[out] keysOut Output of sorted keys GPU main memory
* @param[in] keysIn Input of unsorted keys in GPU main memory
* @param[in] numElements Total number of elements to sort
* @param[in] totalBlocks Total number of blocks to sort
*
*/
template<uint nbits, uint startbit, bool fullBlocks, bool flip, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
radixSortBlocksKeysOnly(uint4* keysOut, uint4* keysIn, uint numElements, uint totalBlocks)
{
extern __shared__ uint4 sMem[];
uint4 key;
uint blockId = blockIdx.x;
while (!loop || blockId < totalBlocks)
{
uint i = blockId * blockDim.x + threadIdx.x;
uint idx = i << 2;
// handle non-full last block if array is not multiple of 1024 numElements
if (!fullBlocks && idx+3 >= numElements)
{
if (idx >= numElements)
{
key = make_uint4(UINT_MAX, UINT_MAX, UINT_MAX, UINT_MAX);
}
else
{
// for non-full block, we handle uint1 values instead of uint4
uint *keys1 = (uint*)keysIn;
key.x = (idx < numElements) ? floatFlip<flip>(keys1[idx]) : UINT_MAX;
key.y = (idx+1 < numElements) ? floatFlip<flip>(keys1[idx+1]) : UINT_MAX;
key.z = (idx+2 < numElements) ? floatFlip<flip>(keys1[idx+2]) : UINT_MAX;
key.w = UINT_MAX;
}
}
else
{
key = keysIn[i];
if (flip)
{
key.x = floatFlip<flip>(key.x);
key.y = floatFlip<flip>(key.y);
key.z = floatFlip<flip>(key.z);
key.w = floatFlip<flip>(key.w);
}
}
__syncthreads();
radixSortBlockKeysOnly<nbits, startbit>(key);
// handle non-full last block if array is not multiple of 1024 numElements
if(!fullBlocks && idx+3 >= numElements)
{
if (idx < numElements)
{
// for non-full block, we handle uint1 values instead of uint4
uint *keys1 = (uint*)keysOut;
keys1[idx] = key.x;
if (idx + 1 < numElements)
{
keys1[idx + 1] = key.y;
if (idx + 2 < numElements)
{
keys1[idx + 2] = key.z;
}
}
}
}
else
{
keysOut[i] = key;
}
if (loop)
blockId += gridDim.x;
else
break;
}
}
/** @brief Reorders data in the global array.
*
* reorderDataKeysOnly shuffles data in the array globally after the radix offsets
* have been found. On compute version 1.1 and earlier GPUs, this code depends
* on SORT_CTA_SIZE being 16 * number of radices (i.e. 16 * 2^nbits).
*
* On compute version 1.1 GPUs ("manualCoalesce=true") this function ensures
* that all writes are coalesced using extra work in the kernel. On later
* GPUs coalescing rules have been relaxed, so this extra overhead hurts
* performance. On these GPUs we set manualCoalesce=false and directly store
* the results.
*
* Template parameters are used to generate efficient code for various special cases
* For example, we have to handle arrays that are a multiple of the block size
* (fullBlocks) differently than arrays that are not. "loop" is used when persistent
* CTAs are used.
*
* By persistent CTAs we mean that we launch only as many thread blocks as can
* be resident in the GPU and no more, rather than launching as many threads as
* we have elements. Persistent CTAs loop over blocks of elements until all work
* is complete. This can be faster in some cases. In our tests it is faster
* for large sorts (and the threshold is higher on compute version 1.1 and earlier
 * GPUs than it is on compute version 1.2 GPUs).
*
* @param[out] outKeys Output result of reorderDataKeysOnly()
* @param[in] keys Keys to be reordered
* @param[in] blockOffsets Start offset for each block
* @param[in] offsets Offset of each radix within each block
* @param[in] sizes Number of elements in a block
* @param[in] numElements Total number of elements
* @param[in] totalBlocks Total number of blocks
*/
template<uint startbit, bool fullBlocks, bool manualCoalesce, bool unflip, bool loop>
__global__ void
LAUNCH_BOUNDS(SORT_CTA_SIZE)
reorderDataKeysOnly(uint *outKeys,
uint2 *keys,
uint *blockOffsets,
uint *offsets,
uint *sizes,
uint numElements,
uint totalBlocks)
{
__shared__ uint2 sKeys2[SORT_CTA_SIZE];
__shared__ uint sOffsets[16];
__shared__ uint sBlockOffsets[16];
uint *sKeys1 = (uint*)sKeys2;
uint blockId = blockIdx.x;
while (!loop || blockId < totalBlocks)
{
uint i = blockId * blockDim.x + threadIdx.x;
// handle non-full last block if array is not multiple of 1024 numElements
if(!fullBlocks && (((i + 1) << 1) > numElements))
{
uint *keys1 = (uint*)keys;
uint j = i << 1;
sKeys1[threadIdx.x << 1] = (j < numElements) ? keys1[j] : UINT_MAX;
j++;
sKeys1[(threadIdx.x << 1) + 1] = (j < numElements) ? keys1[j] : UINT_MAX;
}
else
{
sKeys2[threadIdx.x] = keys[i];
}
if (!manualCoalesce)
{
if(threadIdx.x < 16)
{
sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId];
sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
}
__syncthreads();
uint radix = (sKeys1[threadIdx.x] >> startbit) & 0xF;
uint globalOffset = sOffsets[radix] + threadIdx.x - sBlockOffsets[radix];
if (fullBlocks || globalOffset < numElements)
{
outKeys[globalOffset] = floatUnflip<unflip>(sKeys1[threadIdx.x]);
}
radix = (sKeys1[threadIdx.x + SORT_CTA_SIZE] >> startbit) & 0xF;
globalOffset = sOffsets[radix] + threadIdx.x + SORT_CTA_SIZE - sBlockOffsets[radix];
if (fullBlocks || globalOffset < numElements)
{
outKeys[globalOffset] = floatUnflip<unflip>(sKeys1[threadIdx.x + SORT_CTA_SIZE]);
}
}
else
{
__shared__ uint sSizes[16];
if(threadIdx.x < 16)
{
sOffsets[threadIdx.x] = offsets[threadIdx.x * totalBlocks + blockId];
sBlockOffsets[threadIdx.x] = blockOffsets[blockId * 16 + threadIdx.x];
sSizes[threadIdx.x] = sizes[threadIdx.x * totalBlocks + blockId];
}
__syncthreads();
// 1 half-warp is responsible for writing out all values for 1 radix.
// Loops if there are more than 16 values to be written out.
// All start indices are rounded down to the nearest multiple of 16, and
// all end indices are rounded up to the nearest multiple of 16.
// Thus it can do extra work if the start and end indices are not multiples of 16
// This is bounded by a factor of 2 (it can do 2X more work at most).
const uint halfWarpID = threadIdx.x >> 4;
const uint halfWarpOffset = threadIdx.x & 0xF;
const uint leadingInvalid = sOffsets[halfWarpID] & 0xF;
uint startPos = sOffsets[halfWarpID] & 0xFFFFFFF0;
uint endPos = (sOffsets[halfWarpID] + sSizes[halfWarpID]) + 15 -
((sOffsets[halfWarpID] + sSizes[halfWarpID] - 1) & 0xF);
uint numIterations = endPos - startPos;
uint outOffset = startPos + halfWarpOffset;
uint inOffset = sBlockOffsets[halfWarpID] - leadingInvalid + halfWarpOffset;
for(uint j = 0; j < numIterations; j += 16, outOffset += 16, inOffset += 16)
{
if( (outOffset >= sOffsets[halfWarpID]) &&
(inOffset - sBlockOffsets[halfWarpID] < sSizes[halfWarpID]))
{
if(blockId < totalBlocks - 1 || outOffset < numElements)
{
outKeys[outOffset] = floatUnflip<unflip>(sKeys1[inOffset]);
}
}
}
}
if (loop)
{
blockId += gridDim.x;
__syncthreads();
}
else
break;
}
}
/** @} */ // end radixsort functions
/** @} */ // end cudpp_kernel

View File

@@ -0,0 +1,113 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5633 $
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* scan_kernel.cu
*
* @brief CUDPP kernel-level scan routines
*/
/** \defgroup cudpp_kernel CUDPP Kernel-Level API
* The CUDPP Kernel-Level API contains functions that run on the GPU
* device across a grid of Cooperative Thread Array (CTA, aka Thread
* Block). These kernels are declared \c __global__ so that they
* must be invoked from host (CPU) code. They generally invoke GPU
* \c __device__ routines in the CUDPP \link cudpp_cta CTA-Level API\endlink.
* Kernel-Level API functions are used by CUDPP
* \link cudpp_app Application-Level\endlink functions to implement their
* functionality.
* @{
*/
/** @name Scan Functions
* @{
*/
#include <cudpp_globals.h>
#include "cta/scan_cta.cu"
#include "sharedmem.h"
/**
* @brief Main scan kernel
*
* This __global__ device function performs one level of a multiblock scan on
* an arbitrary-dimensioned array in \a d_in, returning the result in \a d_out
* (which may point to the same array). The same function may be used for
* single or multi-row scans. To perform a multirow scan, pass the width of
 * each row of the input array (in elements) in \a dataRowPitch, and the width of
* the rows of \a d_blockSums (in elements) in \a blockSumRowPitch, and invoke
* with a thread block grid with height greater than 1.
*
 * This function performs one level of a recursive, multiblock scan.  At the
* app level, this function is called by cudppScan and cudppMultiScan and used
* in combination with vectorAddUniform4() to produce a complete scan.
*
* Template parameter \a T is the datatype of the array to be scanned.
* Template parameter \a traits is the ScanTraits struct containing
* compile-time options for the scan, such as whether it is forward or
* backward, exclusive or inclusive, multi- or single-row, etc.
*
* @param[out] d_out The output (scanned) array
* @param[in] d_in The input array to be scanned
* @param[out] d_blockSums The array of per-block sums
* @param[in] numElements The number of elements to scan
* @param[in] dataRowPitch The width of each row of \a d_in in elements
* (for multi-row scans)
 * @param[in]  blockSumRowPitch The width of each row of \a d_blockSums in elements
* (for multi-row scans)
*/
template<class T, class traits>
__global__ void scan4(T *d_out,
const T *d_in,
T *d_blockSums,
int numElements,
unsigned int dataRowPitch,
unsigned int blockSumRowPitch)
{
SharedMemory<T> smem;
T* temp = smem.getPointer();
int devOffset, ai, bi, aiDev, biDev;
T threadScan0[4], threadScan1[4];
unsigned int blockN = numElements;
unsigned int blockSumIndex = blockIdx.x;
if (traits::isMultiRow())
{
//int width = __mul24(gridDim.x, blockDim.x) << 1;
int yIndex = __umul24(blockDim.y, blockIdx.y) + threadIdx.y;
devOffset = __umul24(dataRowPitch, yIndex);
blockN += (devOffset << 2);
devOffset += __umul24(blockIdx.x, blockDim.x << 1);
blockSumIndex += __umul24(blockSumRowPitch << 2, yIndex) ;
}
else
{
devOffset = __umul24(blockIdx.x, (blockDim.x << 1));
}
// load data into shared memory
loadSharedChunkFromMem4<T, traits>
(temp, threadScan0, threadScan1, d_in,
blockN, devOffset, ai, bi, aiDev, biDev);
scanCTA<T, traits>(temp, d_blockSums, blockSumIndex);
// write results to device memory
storeSharedChunkToMem4<T, traits>
(d_out, threadScan0, threadScan1, temp,
blockN, devOffset, ai, bi, aiDev, biDev);
}
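/* A sketch of the recursion described above (the real driver is in the CUDPP
 * application-level code; the block/grid sizes, shared-memory bytes, and the
 * scanBlockSums() helper are hypothetical placeholders):
 *
 *   scan4<T, traits><<<numBlocks, CTA_SIZE, smemBytes>>>
 *       (d_out, d_in, d_blockSums, numElements, 0, 0);
 *   if (numBlocks > 1)
 *   {
 *       // recursively scan the per-block sums...
 *       scanBlockSums(d_blockSums, numBlocks);
 *       // ...then add each scanned block sum back into its block's results
 *       vectorAddUniform4<T, CUDPP_ADD, 8><<<numBlocks, CTA_SIZE>>>
 *           (d_out, d_blockSums, numElements, 0, 0, 0, 0);
 *   }
 */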
/** @} */ // end scan functions
/** @} */ // end cudpp_kernel

View File

@@ -0,0 +1,469 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5632 $
// $Date: 2009-07-01 14:36:01 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt in
// the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* vector_kernel.cu
*
* @brief CUDA kernel methods for basic operations on vectors.
*
* CUDA kernel methods for basic operations on vectors.
*
* Examples:
* - vectorAddConstant(): d_vector + constant
* - vectorAddUniform(): d_vector + uniform (per-block constants)
* - vectorAddVectorVector(): d_vector + d_vector
*/
// MJH: these functions assume there are 2N elements for N threads.
// Is this always going to be a good idea? There may be cases where
// we have as many threads as elements, but for large problems
// we are probably limited by max CTA size for simple kernels like
// this so we should process multiple elements per thread.
// we may want to extend these with looping versions that process
// many elements per thread.
#include "cudpp_util.h"
#include "sharedmem.h"
#include "cudpp.h"
/** \addtogroup cudpp_kernel
* @{
*/
/** @name Vector Functions
* CUDA kernel methods for basic operations on vectors.
* @{
*/
/** @brief Adds a constant value to all values in the input d_vector
*
* Each thread adds two pairs of elements.
* @todo Test this function -- it is currently not yet used.
*
* @param[in,out] d_vector The array of elements to be modified
* @param[in] constant The constant value to be added to elements of
* \a d_vector
* @param[in] n The number of elements in the d_vector to be modified
* @param[in] baseIndex An optional offset to the beginning of the
* elements in the input array to be processed
*/
template <class T>
__global__ void vectorAddConstant(T *d_vector,
T constant,
int n,
int baseIndex)
{
// Compute this thread's output address
unsigned int address = baseIndex + threadIdx.x +
__mul24(blockIdx.x, (blockDim.x << 1));
// note two adds per thread: one in first half of the block, one in last
d_vector[address] += constant;
d_vector[address + blockDim.x] += (threadIdx.x + blockDim.x < n) * constant;
}
/** @brief Add a uniform value to each data element of an array
*
* This function reads one value per CTA from \a d_uniforms into shared
* memory and adds that value to all values "owned" by the CTA in \a
* d_vector. Each thread adds two pairs of values.
*
* @param[out] d_vector The d_vector whose values will have the uniform added
* @param[in] d_uniforms The array of uniform values (one per CTA)
* @param[in] numElements The number of elements in \a d_vector to process
* @param[in] blockOffset an optional offset to the beginning of this block's
* data.
* @param[in] baseIndex an optional offset to the beginning of the array
* within \a d_vector.
*/
template <class T>
__global__ void vectorAddUniform(T *d_vector,
const T *d_uniforms,
int numElements,
int blockOffset,
int baseIndex)
{
__shared__ T uni;
// Get this block's uniform value from the uniform array in device memory
// We store it in shared memory so that the hardware's shared memory
// broadcast capability can be used to share among all threads in each warp
// in a single cycle
if (threadIdx.x == 0)
{
uni = d_uniforms[blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset];
}
// Compute this thread's output address
int width = __mul24(gridDim.x,(blockDim.x << 1));
unsigned int address = baseIndex + __mul24(width, blockIdx.y)
+ threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 1));
__syncthreads();
// note two adds per thread: one in first half of the block, one in last
d_vector[address] += uni;
if (threadIdx.x + blockDim.x < numElements) d_vector[address + blockDim.x] += uni;
}
/** @brief Add a uniform value to each data element of an array (vec4 version)
*
* This function reads one value per CTA from \a d_uniforms into shared
* memory and adds that value to all values "owned" by the CTA in \a d_vector.
* Each thread adds the uniform value to eight values in \a d_vector.
*
* @param[out] d_vector The d_vector whose values will have the uniform added
* @param[in] d_uniforms The array of uniform values (one per CTA)
* @param[in] numElements The number of elements in \a d_vector to process
* @param[in] vectorRowPitch For 2D arrays, the pitch (in elements) of the
* rows of \a d_vector.
* @param[in] uniformRowPitch For 2D arrays, the pitch (in elements) of the
* rows of \a d_uniforms.
* @param[in] blockOffset an optional offset to the beginning of this block's
* data.
* @param[in] baseIndex an optional offset to the beginning of the array
* within \a d_vector.
*/
template <class T, CUDPPOperator op, int elementsPerThread>
__global__ void vectorAddUniform4(T *d_vector,
const T *d_uniforms,
int numElements,
int vectorRowPitch, // width of input array in elements
int uniformRowPitch, // width of uniform array in elements
int blockOffset,
int baseIndex)
{
__shared__ T uni;
// Get this block's uniform value from the uniform array in device memory
// We store it in shared memory so that the hardware's shared memory
// broadcast capability can be used to share among all threads in each warp
// in a single cycle
if (threadIdx.x == 0)
{
uni = d_uniforms[blockIdx.x + __umul24(uniformRowPitch, blockIdx.y) + blockOffset];
}
// Compute this thread's output address
//int width = __mul24(gridDim.x,(blockDim.x << 1));
unsigned int address = baseIndex + __umul24(vectorRowPitch, blockIdx.y)
+ threadIdx.x + __umul24(blockIdx.x, (blockDim.x * elementsPerThread));
numElements += __umul24(vectorRowPitch, blockIdx.y);
__syncthreads();
switch (op)
{
case CUDPP_ADD:
for (int i = 0; i < elementsPerThread && address < numElements; i++)
{
d_vector[address] += uni;
address += blockDim.x;
}
break;
case CUDPP_MULTIPLY:
for (int i = 0; i < elementsPerThread && address < numElements; i++)
{
d_vector[address] *= uni;
address += blockDim.x;
}
break;
case CUDPP_MAX:
for (int i = 0; i < elementsPerThread && address < numElements; i++)
{
d_vector[address] = max(d_vector[address], uni);
address += blockDim.x;
}
break;
case CUDPP_MIN:
for (int i = 0; i < elementsPerThread && address < numElements; i++)
{
d_vector[address] = min(d_vector[address], uni);
address += blockDim.x;
}
break;
default:
break;
}
}
/** @brief Adds together two vectors
*
* Each thread adds two pairs of elements.
* @todo Test this function -- it is currently not yet used.
*
* @param[out] d_vectorA The left operand array and the result
* @param[in] d_vectorB The right operand array
* @param[in] numElements The number of elements in the vectors to be added.
* @param[in] baseIndex An optional offset to the beginning of the
* elements in the input arrays to be processed
*/
template <class T>
__global__ void vectorAddVector(T *d_vectorA, // A += B
const T *d_vectorB,
int numElements,
int baseIndex)
{
// Compute this thread's output address
unsigned int address = baseIndex + threadIdx.x +
__mul24(blockIdx.x, (blockDim.x << 1));
// note two adds per thread: one in first half of the block, one in last
d_vectorA[address] += d_vectorB[address];
    d_vectorA[address + blockDim.x] +=
        (threadIdx.x + blockDim.x < numElements) * d_vectorB[address + blockDim.x];
}
/** @brief Add a uniform value to data elements of an array (vec4 version)
*
* This function reads one value per CTA from \a d_uniforms into shared
* memory and adds that value to values "owned" by the CTA in \a d_vector.
* The uniform value is added to only those values "owned" by the CTA which
* have an index less than d_maxIndex. If d_maxIndex for that CTA is UINT_MAX
* it adds the uniform to all values "owned" by the CTA.
* Each thread adds the uniform value to eight values in \a d_vector.
*
* @param[out] d_vector The d_vector whose values will have the uniform added
* @param[in] d_uniforms The array of uniform values (one per CTA)
 * @param[in] d_maxIndices The array of maximum indices (one per CTA). This is
 *                      the index up to which the uniform is added. If this is UINT_MAX
 *                      the uniform is added to all elements of the CTA. This index is
 *                      1-based.
* @param[in] numElements The number of elements in \a d_vector to process
* @param[in] blockOffset an optional offset to the beginning of this block's
* data.
* @param[in] baseIndex an optional offset to the beginning of the array
* within \a d_vector.
*/
template <class T, CUDPPOperator oper, bool isLastBlockFull>
__global__ void vectorSegmentedAddUniform4(T *d_vector,
const T *d_uniforms,
const unsigned int *d_maxIndices,
unsigned int numElements,
int blockOffset,
int baseIndex)
{
__shared__ T uni[2];
unsigned int blockAddress =
blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset;
// Get this block's uniform value from the uniform array in device memory
// We store it in shared memory so that the hardware's shared memory
// broadcast capability can be used to share among all threads in each warp
// in a single cycle
if (threadIdx.x == 0)
{
if (blockAddress > 0)
uni[0] = d_uniforms[blockAddress-1];
else
uni[0] = Operator<T, oper>::identity();
// Tacit assumption that T is four-byte wide
uni[1] = (T)(d_maxIndices[blockAddress]);
}
// Compute this thread's output address
int width = __mul24(gridDim.x,(blockDim.x << 1));
unsigned int address = baseIndex + __mul24(width, blockIdx.y)
+ threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3));
__syncthreads();
unsigned int maxIndex = (unsigned int)(uni[1]);
bool isLastBlock = (blockIdx.x == (gridDim.x-1));
if (maxIndex < UINT_MAX)
{
// Since maxIndex is a 1 based index
--maxIndex;
bool leftLess = address < maxIndex;
bool rightLess = (address + 7 * blockDim.x) < maxIndex;
if (leftLess)
{
if (rightLess)
{
for (unsigned int i = 0; i < 8; ++i)
d_vector[address + i * blockDim.x] =
Operator<T, oper>::op(d_vector[address + i * blockDim.x], uni[0]);
}
else
{
for (unsigned int i=0; i < 8; ++i)
{
if (address < maxIndex)
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
}
}
else
{
if (!isLastBlockFull && isLastBlock)
{
for (unsigned int i = 0; i < 8; ++i)
{
if (address < numElements)
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
else
{
for (unsigned int i=0; i<8; ++i)
{
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
}
}
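/* A small worked example of the 1-based d_maxIndices convention (assuming
 * baseIndex == 0 and blockDim.x == 256, so each CTA owns 8*256 = 2048
 * consecutive elements): if d_maxIndices[blockAddress] == 100, the operator is
 * applied only to the CTA's elements whose global address is below 99, i.e.
 * 0-based indices 0..98; if it is UINT_MAX, every element the CTA owns is
 * updated (subject to the numElements bound in a partial last block).
 */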
/** @brief Add a uniform value to data elements of an array (vec4 version)
*
* This function reads one value per CTA from \a d_uniforms into shared
* memory and adds that value to values "owned" by the CTA in \a d_vector.
* The uniform value is added to only those values "owned" by the CTA which
* have an index greater than d_minIndex. If d_minIndex for that CTA is 0
* it adds the uniform to all values "owned" by the CTA.
* Each thread adds the uniform value to eight values in \a d_vector.
*
* @param[out] d_vector The d_vector whose values will have the uniform added
* @param[in] d_uniforms The array of uniform values (one per CTA)
 * @param[in] d_minIndices The array of minimum indices (one per CTA). The
 *                      uniform is added to the right of this index (that is, to every
 *                      index greater than this index). If this is 0, the uniform is
 *                      added to all elements of the CTA. The index is 1-based to avoid
 *                      overloading the meaning of 0: a value of 0 means that no flag is
 *                      present, whereas with 0-based indexing a flag on the first
 *                      element of a CTA would also yield 0; with 1-based indexing that
 *                      case yields 1 instead.
* @param[in] numElements The number of elements in \a d_vector to process
* @param[in] blockOffset an optional offset to the beginning of this block's
* data.
* @param[in] baseIndex an optional offset to the beginning of the array
* within \a d_vector.
*
*/
template <class T, CUDPPOperator oper, bool isLastBlockFull>
__global__ void vectorSegmentedAddUniformToRight4(T *d_vector,
const T *d_uniforms,
const unsigned int *d_minIndices,
unsigned int numElements,
int blockOffset,
int baseIndex)
{
__shared__ T uni[2];
unsigned int blockAddress =
blockIdx.x + __mul24(gridDim.x, blockIdx.y) + blockOffset;
// Get this block's uniform value from the uniform array in device memory
// We store it in shared memory so that the hardware's shared memory
// broadcast capability can be used to share among all threads in each warp
// in a single cycle
if (threadIdx.x == 0)
{
// FIXME - blockAddress test here is incompatible with how it is calculated
// above
if (blockAddress < (gridDim.x-1))
uni[0] = d_uniforms[blockAddress+1];
else
uni[0] = Operator<T, oper>::identity();
// Tacit assumption that T is four bytes wide
uni[1] = (T)(d_minIndices[blockAddress]);
}
// Compute this thread's output address
int width = __mul24(gridDim.x,(blockDim.x << 1));
unsigned int address = baseIndex + __mul24(width, blockIdx.y)
+ threadIdx.x + __mul24(blockIdx.x, (blockDim.x << 3));
__syncthreads();
unsigned int minIndex = (unsigned int)(uni[1]);
bool isLastBlock = (blockIdx.x == (gridDim.x-1));
if (minIndex > 0)
{
// Since minIndex is a 1-based index
--minIndex;
bool leftInRange = address > minIndex;
bool rightInRange = (address + 7 * blockDim.x) > minIndex;
if (rightInRange)
{
if (leftInRange)
{
for (unsigned int i = 0; i < 8; ++i)
d_vector[address + i * blockDim.x] =
Operator<T, oper>::op(d_vector[address + i * blockDim.x], uni[0]);
}
else
{
for (unsigned int i=0; i < 8; ++i)
{
if (address > minIndex)
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
}
}
else
{
if (!isLastBlockFull && isLastBlock)
{
for (unsigned int i = 0; i < 8; ++i)
{
if (address < numElements)
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
else
{
for (unsigned int i=0; i<8; ++i)
{
d_vector[address] =
Operator<T, oper>::op(d_vector[address], uni[0]);
address += blockDim.x;
}
}
}
}
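/* Illustrative sketch (not part of the original CUDPP sources): a plain CPU
   reference for the 1-based index convention used by the segmented
   add-uniform kernels above. It only demonstrates the minIndex handling of
   the "ToRight" variant for the elements one CTA "owns"; the real kernel
   additionally picks the uniform from a neighbouring CTA's slot and unrolls
   the work eight elements per thread. The function and variable names here
   are hypothetical. */
#include <cstddef>
#include <vector>

// Add 'uniform' to every element of v in [ctaBegin, ctaEnd) that lies strictly
// to the right of the flagged position. minIndex1 is 1-based; 0 means the CTA
// has no flag, in which case all of the CTA's elements receive the uniform.
static void segmentedAddUniformToRightRef(std::vector<float> &v,
                                          std::size_t ctaBegin,
                                          std::size_t ctaEnd,
                                          unsigned int minIndex1,
                                          float uniform)
{
    std::size_t first = ctaBegin;
    if (minIndex1 > 0) {
        std::size_t flagged = minIndex1 - 1;   // convert to 0-based
        first = flagged + 1;                   // strictly to the right of the flag
        if (first < ctaBegin) first = ctaBegin;
    }
    for (std::size_t i = first; i < ctaEnd; ++i)
        v[i] += uniform;
}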
/** @} */ // end d_vector functions
/** @} */ // end cudpp_kernel

View File

@ -0,0 +1,25 @@
Copyright (c) 2007-2010 The Regents of the University of California, Davis
campus ("The Regents") and NVIDIA Corporation ("NVIDIA"). All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the The Regents, nor NVIDIA, nor the names of its
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -0,0 +1,993 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* radixsort_app.cu
*
* @brief CUDPP application-level radix sorting routines
*/
/** @addtogroup cudpp_app
* @{
*/
/** @name RadixSort Functions
* @{
*/
#include "cudpp.h"
#include "cudpp_util.h"
#include "cudpp_radixsort.h"
#include "cudpp_scan.h"
#include "kernel/radixsort_kernel.cu"
#include <cutil.h>
#include <cstdlib>
#include <cstdio>
#include <assert.h>
typedef unsigned int uint;
/** @brief Perform one step of the radix sort. Sorts by nbits key bits per step,
* starting at startbit.
*
* Uses cudppScanDispatch() for the prefix sum of radix counters.
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] plan Configuration information for RadixSort.
* @param[in] numElements Number of elements in the sort.
**/
template<uint nbits, uint startbit, bool flip, bool unflip>
void radixSortStep(uint *keys,
uint *values,
const CUDPPRadixSortPlan *plan,
uint numElements)
{
const uint eltsPerBlock = SORT_CTA_SIZE * 4;
const uint eltsPerBlock2 = SORT_CTA_SIZE * 2;
bool fullBlocks = ((numElements % eltsPerBlock) == 0);
uint numBlocks = (fullBlocks) ?
(numElements / eltsPerBlock) :
(numElements / eltsPerBlock + 1);
uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ?
(numElements / eltsPerBlock2) :
(numElements / eltsPerBlock2 + 1);
bool loop = numBlocks > 65535;
uint blocks = loop ? 65535 : numBlocks;
uint blocksFind = loop ? 65535 : numBlocks2;
uint blocksReorder = loop ? 65535 : numBlocks2;
uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[0] : plan->m_persistentCTAThreshold[0];
bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold);
if (persist)
{
loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536);
blocks = numBlocks;
blocksFind = numBlocks2;
blocksReorder = numBlocks2;
// Run an empty kernel -- this seems to reset some of the CTA scheduling hardware
// on GT200, resulting in better scheduling and lower run times
if (startbit > 0)
{
emptyKernel<<<numCTAs(emptyKernel), SORT_CTA_SIZE>>>();
}
}
if (fullBlocks)
{
if (loop)
{
if (persist)
{
blocks = flip? numCTAs(radixSortBlocks<4, 0, true, true, true>) :
numCTAs(radixSortBlocks<4, 0, true, false, true>);
}
radixSortBlocks<nbits, startbit, true, flip, true>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
}
else
{
radixSortBlocks<nbits, startbit, true, flip, false>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
}
}
else
{
if (loop)
{
if (persist)
{
blocks = flip ? numCTAs(radixSortBlocks<4, 0, false, true, true>) :
numCTAs(radixSortBlocks<4, 0, false, false, true>);
}
radixSortBlocks<nbits, startbit, false, flip, true>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
}
else
{
radixSortBlocks<nbits, startbit, false, flip, false>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)plan->m_tempValues, (uint4*)keys, (uint4*)values, numElements, numBlocks);
}
}
CUT_CHECK_ERROR("radixSortBlocks");
if (fullBlocks)
{
if (loop)
{
if (persist)
{
blocksFind = numCTAs(findRadixOffsets<0, true, true>);
}
findRadixOffsets<startbit, true, true>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
{
findRadixOffsets<startbit, true, false>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
}
else
{
if (loop)
{
if (persist)
{
blocksFind = numCTAs(findRadixOffsets<0, false, true>);
}
findRadixOffsets<startbit, false, true>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
{
findRadixOffsets<startbit, false, false>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
}
CUT_CHECK_ERROR("findRadixOffsets");
cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan);
if (fullBlocks)
{
if (plan->m_bManualCoalesce)
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ? numCTAs(reorderData<0, true, true, true, true>) :
numCTAs(reorderData<0, true, true, false, true>);
}
reorderData<startbit, true, true, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
else
{
reorderData<startbit, true, true, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
}
else
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ? numCTAs(reorderData<0, true, false, true, true>) :
numCTAs(reorderData<0, true, false, false, true>);
}
reorderData<startbit, true, false, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
else
{
reorderData<startbit, true, false, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
}
}
else
{
if (plan->m_bManualCoalesce)
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderData<0, false, true, true, true>) :
numCTAs(reorderData<0, false, true, false, true>);
}
reorderData<startbit, false, true, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
else
{
reorderData<startbit, false, true, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
}
else
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderData<0, false, false, true, true>) :
numCTAs(reorderData<0, false, false, false, true>);
}
reorderData<startbit, false, false, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
else
{
reorderData<startbit, false, false, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, values, (uint2*)plan->m_tempKeys, (uint2*)plan->m_tempValues,
plan->m_blockOffsets, plan->m_countersSum, plan->m_counters, numElements, numBlocks2);
}
}
}
CUT_CHECK_ERROR("radixSortStep");
}
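/* Illustrative sketch (not part of the original CUDPP sources): a serial CPU
   version of one LSD radix pass, to show what the three GPU phases above
   accomplish together -- radixSortBlocks/findRadixOffsets build a per-digit
   histogram, cudppScanDispatch turns it into exclusive offsets, and
   reorderData performs the stable scatter. Names below are hypothetical. */
#include <cstddef>
#include <cstdint>
#include <vector>

static void radixPassRef(std::vector<uint32_t> &keys,
                         std::vector<uint32_t> &values,
                         unsigned int startbit, unsigned int nbits)
{
    const std::size_t n = keys.size();
    const uint32_t radix = 1u << nbits;
    const uint32_t mask  = radix - 1u;

    // Histogram of the current digit (analogue of the radix counters).
    std::vector<std::size_t> count(radix, 0);
    for (std::size_t i = 0; i < n; ++i)
        ++count[(keys[i] >> startbit) & mask];

    // Exclusive prefix sum of the histogram (analogue of cudppScanDispatch).
    std::vector<std::size_t> offset(radix, 0);
    for (uint32_t d = 1; d < radix; ++d)
        offset[d] = offset[d - 1] + count[d - 1];

    // Stable scatter into the output (analogue of reorderData).
    std::vector<uint32_t> outKeys(n), outValues(n);
    for (std::size_t i = 0; i < n; ++i) {
        const uint32_t d = (keys[i] >> startbit) & mask;
        outKeys[offset[d]]   = keys[i];
        outValues[offset[d]] = values[i];
        ++offset[d];
    }
    keys.swap(outKeys);
    values.swap(outValues);
}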
/**
* @brief Single-block optimization for sorts of fewer than 4 * SORT_CTA_SIZE elements
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param numElements Number of elements in the sort.
**/
template <bool flip>
void radixSortSingleBlock(uint *keys,
uint *values,
uint numElements)
{
bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0);
if (fullBlocks)
{
radixSortBlocks<32, 0, true, flip, false>
<<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)keys, (uint4*)values,
(uint4*)keys, (uint4*)values,
numElements, 0);
}
else
{
radixSortBlocks<32, 0, false, flip, false>
<<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)keys, (uint4*)values,
(uint4*)keys, (uint4*)values,
numElements, 0);
}
if (flip) unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements);
CUT_CHECK_ERROR("radixSortSingleBlock");
}
/**
* @brief Main radix sort function
*
* Main radix sort function. Sorts in place in the keys and values arrays,
* but uses the other device arrays as temporary storage. All pointer
* parameters are device pointers. Uses cudppScan() for the prefix sum of
* radix counters.
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] plan Configuration information for RadixSort.
* @param[in] numElements Number of elements in the sort.
* @param[in]  flipBits  Set to true if the key datatype is float (and may
*                 contain negative numbers), enabling the special float
*                 sorting transform.
* @param[in] keyBits Number of interesting bits in the key
**/
void radixSort(uint *keys,
uint* values,
const CUDPPRadixSortPlan *plan,
size_t numElements,
bool flipBits,
int keyBits)
{
if(numElements <= WARP_SIZE)
{
if (flipBits)
radixSortSingleWarp<true><<<1, numElements>>>
(keys, values, numElements);
else
radixSortSingleWarp<false><<<1, numElements>>>
(keys, values, numElements);
CUT_CHECK_ERROR("radixSortSingleWarp");
return;
}
#ifdef __DEVICE_EMULATION__
printf("bits: %d\n", keyBits);
#endif
if(numElements <= SORT_CTA_SIZE * 4)
{
if (flipBits)
radixSortSingleBlock<true>(keys, values, numElements);
else
radixSortSingleBlock<false>(keys, values, numElements);
return;
}
// flip float bits on the first pass, unflip on the last pass
if (flipBits)
{
radixSortStep<4, 0, true, false>
(keys, values, plan, numElements);
}
else
{
radixSortStep<4, 0, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 4)
{
radixSortStep<4, 4, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 8)
{
radixSortStep<4, 8, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 12)
{
radixSortStep<4, 12, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 16)
{
radixSortStep<4, 16, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 20)
{
radixSortStep<4, 20, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 24)
{
radixSortStep<4, 24, false, false>
(keys, values, plan, numElements);
}
if (keyBits > 28)
{
if (flipBits) // last pass
{
radixSortStep<4, 28, false, true>
(keys, values, plan, numElements);
}
else
{
radixSortStep<4, 28, false, false>
(keys, values, plan, numElements);
}
}
}
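/* Illustrative sketch (not part of the original CUDPP sources): the standard
   order-preserving bit transform behind the flipBits/unflip template
   arguments used above. The actual device-side flip/unflip code lives in
   kernel/radixsort_kernel.cu (not shown here); these host helpers only
   document the idea: flipping the sign bit of non-negative floats and all
   bits of negative floats makes IEEE-754 float keys sort correctly as
   unsigned integers. Function names are hypothetical. */
static inline unsigned int floatFlipRef(unsigned int f)
{
    // Negative floats: flip every bit; non-negative floats: flip the sign bit.
    unsigned int mask = (f & 0x80000000u) ? 0xFFFFFFFFu : 0x80000000u;
    return f ^ mask;
}

static inline unsigned int floatUnflipRef(unsigned int f)
{
    // Inverse of floatFlipRef, applied after the final sorting pass.
    unsigned int mask = (f & 0x80000000u) ? 0x80000000u : 0xFFFFFFFFu;
    return f ^ mask;
}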
/**
* @brief Wrapper to call main radix sort function. For float configuration.
*
* Calls the main radix sort function. For float configuration.
*
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] plan Configuration information for RadixSort.
* @param[in] numElements Number of elements in the sort.
* @param[in]  negativeKeys Set to true if the key datatype can contain negative numbers.
* @param[in] keyBits Number of interesting bits in the key
**/
extern "C"
void radixSortFloatKeys(float* keys,
uint* values,
const CUDPPRadixSortPlan *plan,
size_t numElements,
bool negativeKeys,
int keyBits)
{
radixSort((uint*)keys, (uint*)values, plan,
numElements, negativeKeys, keyBits);
}
/** @brief Perform one step of the radix sort. Sorts by nbits key bits per step,
* starting at startbit.
*
* @param[in,out] keys Keys to be sorted.
* @param[in] plan Configuration information for RadixSort.
* @param[in] numElements Number of elements in the sort.
**/
template<uint nbits, uint startbit, bool flip, bool unflip>
void radixSortStepKeysOnly(uint *keys,
const CUDPPRadixSortPlan *plan,
uint numElements)
{
const uint eltsPerBlock = SORT_CTA_SIZE * 4;
const uint eltsPerBlock2 = SORT_CTA_SIZE * 2;
bool fullBlocks = ((numElements % eltsPerBlock) == 0);
uint numBlocks = (fullBlocks) ?
(numElements / eltsPerBlock) :
(numElements / eltsPerBlock + 1);
uint numBlocks2 = ((numElements % eltsPerBlock2) == 0) ?
(numElements / eltsPerBlock2) :
(numElements / eltsPerBlock2 + 1);
bool loop = numBlocks > 65535;
uint blocks = loop ? 65535 : numBlocks;
uint blocksFind = loop ? 65535 : numBlocks2;
uint blocksReorder = loop ? 65535 : numBlocks2;
uint threshold = fullBlocks ? plan->m_persistentCTAThresholdFullBlocks[1] : plan->m_persistentCTAThreshold[1];
bool persist = plan->m_bUsePersistentCTAs && (numElements >= threshold);
if (persist)
{
loop = (numElements > 262144) || (numElements >= 32768 && numElements < 65536);
blocks = numBlocks;
blocksFind = numBlocks2;
blocksReorder = numBlocks2;
}
if (fullBlocks)
{
if (loop)
{
if (persist)
{
blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>) :
numCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>);
}
radixSortBlocksKeysOnly<nbits, startbit, true, flip, true>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
}
else
radixSortBlocksKeysOnly<nbits, startbit, true, flip, false>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
}
else
{
if (loop)
{
if (persist)
{
blocks = flip ? numCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>) :
numCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>);
}
radixSortBlocksKeysOnly<nbits, startbit, false, flip, true>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
}
else
radixSortBlocksKeysOnly<nbits, startbit, false, flip, false>
<<<blocks, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)plan->m_tempKeys, (uint4*)keys, numElements, numBlocks);
}
if (fullBlocks)
{
if (loop)
{
if (persist)
{
blocksFind = numCTAs(findRadixOffsets<0, true, true>);
}
findRadixOffsets<startbit, true, true>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
findRadixOffsets<startbit, true, false>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
{
if (loop)
{
if (persist)
{
blocksFind = numCTAs(findRadixOffsets<0, false, true>);
}
findRadixOffsets<startbit, false, true>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
else
findRadixOffsets<startbit, false, false>
<<<blocksFind, SORT_CTA_SIZE, 3 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint2*)plan->m_tempKeys, plan->m_counters, plan->m_blockOffsets, numElements, numBlocks2);
}
cudppScanDispatch(plan->m_countersSum, plan->m_counters, 16*numBlocks2, 1, plan->m_scanPlan);
if (fullBlocks)
{
if (plan->m_bManualCoalesce)
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderDataKeysOnly<0, true, true, true, true>) :
numCTAs(reorderDataKeysOnly<0, true, true, false, true>);
}
reorderDataKeysOnly<startbit, true, true, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
reorderDataKeysOnly<startbit, true, true, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderDataKeysOnly<0, true, false, true, true>) :
numCTAs(reorderDataKeysOnly<0, true, false, false, true>);
}
reorderDataKeysOnly<startbit, true, false, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
reorderDataKeysOnly<startbit, true, false, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
}
else
{
if (plan->m_bManualCoalesce)
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderDataKeysOnly<0, false, true, true, true>) :
numCTAs(reorderDataKeysOnly<0, false, true, false, true>);
}
reorderDataKeysOnly<startbit, false, true, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
reorderDataKeysOnly<startbit, false, true, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
{
if (loop)
{
if (persist)
{
blocksReorder = unflip ?
numCTAs(reorderDataKeysOnly<0, false, false, true, true>) :
numCTAs(reorderDataKeysOnly<0, false, false, false, true>);
}
reorderDataKeysOnly<startbit, false, false, unflip, true>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
else
reorderDataKeysOnly<startbit, false, false, unflip, false>
<<<blocksReorder, SORT_CTA_SIZE>>>
(keys, (uint2*)plan->m_tempKeys, plan->m_blockOffsets, plan->m_countersSum, plan->m_counters,
numElements, numBlocks2);
}
}
CUT_CHECK_ERROR("radixSortStepKeysOnly");
}
/**
* @brief Optimization for sorts of fewer than 4 * SORT_CTA_SIZE elements (keys only).
*
* @param[in,out] keys Keys to be sorted.
* @param numElements Number of elements in the sort.
**/
template <bool flip>
void radixSortSingleBlockKeysOnly(uint *keys,
uint numElements)
{
bool fullBlocks = (numElements % (SORT_CTA_SIZE * 4) == 0);
if (fullBlocks)
{
radixSortBlocksKeysOnly<32, 0, true, flip, false>
<<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)keys, (uint4*)keys, numElements, 1 );
}
else
{
radixSortBlocksKeysOnly<32, 0, false, flip, false>
<<<1, SORT_CTA_SIZE, 4 * SORT_CTA_SIZE * sizeof(uint)>>>
((uint4*)keys, (uint4*)keys, numElements, 1 );
}
if (flip)
unflipFloats<<<1, SORT_CTA_SIZE>>>(keys, numElements);
CUT_CHECK_ERROR("radixSortSingleBlock");
}
/**
* @brief Main radix sort function. For keys only configuration.
*
* Main radix sort function. Sorts in place in the keys array,
* but uses the other device arrays as temporary storage. All pointer
* parameters are device pointers. Uses scan for the prefix sum of
* radix counters.
*
* @param[in,out] keys Keys to be sorted.
* @param[in] plan Configuration information for RadixSort.
* @param[in]  flipBits  Set to true if the key datatype is float (and may
*               contain negative numbers), enabling the special float
*               sorting transform.
* @param[in] numElements Number of elements in the sort.
* @param[in] keyBits Number of interesting bits in the key
**/
extern "C"
void radixSortKeysOnly(uint *keys,
const CUDPPRadixSortPlan *plan,
bool flipBits,
size_t numElements,
int keyBits)
{
if(numElements <= WARP_SIZE)
{
if (flipBits)
radixSortSingleWarpKeysOnly<true><<<1, numElements>>>(keys, numElements);
else
radixSortSingleWarpKeysOnly<false><<<1, numElements>>>(keys, numElements);
return;
}
if(numElements <= SORT_CTA_SIZE * 4)
{
if (flipBits)
radixSortSingleBlockKeysOnly<true>(keys, numElements);
else
radixSortSingleBlockKeysOnly<false>(keys, numElements);
return;
}
// flip float bits on the first pass, unflip on the last pass
if (flipBits)
{
radixSortStepKeysOnly<4, 0, true, false>(keys, plan, numElements);
}
else
{
radixSortStepKeysOnly<4, 0, false, false>(keys, plan, numElements);
}
if (keyBits > 4)
{
radixSortStepKeysOnly<4, 4, false, false>(keys, plan, numElements);
}
if (keyBits > 8)
{
radixSortStepKeysOnly<4, 8, false, false>(keys, plan, numElements);
}
if (keyBits > 12)
{
radixSortStepKeysOnly<4, 12, false, false>(keys, plan, numElements);
}
if (keyBits > 16)
{
radixSortStepKeysOnly<4, 16, false, false>(keys, plan, numElements);
}
if (keyBits > 20)
{
radixSortStepKeysOnly<4, 20, false, false>(keys, plan, numElements);
}
if (keyBits > 24)
{
radixSortStepKeysOnly<4, 24, false, false>(keys, plan, numElements);
}
if (keyBits > 28)
{
if (flipBits) // last pass
{
radixSortStepKeysOnly<4, 28, false, true>(keys, plan, numElements);
}
else
{
radixSortStepKeysOnly<4, 28, false, false>(keys, plan, numElements);
}
}
}
/**
* @brief Wrapper to call main radix sort function. For floats and keys only.
*
* Calls the radixSortKeysOnly function setting parameters for floats.
*
* @param[in,out] keys Keys to be sorted.
* @param[in] plan Configuration information for RadixSort.
* @param[in]  negativeKeys Set to true to pass flipBits=true to
*              radixSortKeysOnly(), i.e. the float keys may be negative.
* @param[in] numElements Number of elements in the sort.
* @param[in] keyBits Number of interesting bits in the key
**/
extern "C"
void radixSortFloatKeysOnly(float *keys,
const CUDPPRadixSortPlan *plan,
bool negativeKeys,
size_t numElements,
int keyBits)
{
radixSortKeysOnly((uint*)keys, plan, negativeKeys, numElements, keyBits);
}
extern "C"
void initDeviceParameters(CUDPPRadixSortPlan *plan)
{
int deviceID = -1;
if (cudaSuccess == cudaGetDevice(&deviceID))
{
cudaDeviceProp devprop;
cudaGetDeviceProperties(&devprop, deviceID);
int smVersion = devprop.major * 10 + devprop.minor;
// sm_12 and later devices don't need help with coalescing in the reorderData kernel
plan->m_bManualCoalesce = (smVersion < 12);
// sm_20 and later devices are better off not using persistent CTAs
plan->m_bUsePersistentCTAs = (smVersion < 20);
if (plan->m_bUsePersistentCTAs)
{
// The following is only true on pre-sm_20 devices (pre-Fermi):
// Empirically we have found that for some (usually larger) sort
// sizes it is better to use exactly as many "persistent" CTAs
// as can fill the GPU, which loop over the "blocks" of work. For smaller
// arrays it is better to use the typical CUDA approach of launching one CTA
// per block of work.
// 0-element of these two-element arrays is for key-value sorts
// 1-element is for key-only sorts
plan->m_persistentCTAThreshold[0] = plan->m_bManualCoalesce ? 16777216 : 524288;
plan->m_persistentCTAThresholdFullBlocks[0] = plan->m_bManualCoalesce ? 2097152: 524288;
plan->m_persistentCTAThreshold[1] = plan->m_bManualCoalesce ? 16777216 : 8388608;
plan->m_persistentCTAThresholdFullBlocks[1] = plan->m_bManualCoalesce ? 2097152: 0;
// create a map of function pointers to register counts for more accurate occupancy calculation
// Must pass in the dynamic shared memory used by each kernel, since the runtime doesn't know it
// Note we only insert the "loop" version of the kernels (the one with the last template param = true),
// because those are the only ones that require persistent CTAs that maximally fill the device.
computeNumCTAs(radixSortBlocks<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocks<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocks<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocks<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocksKeysOnly<4, 0, false, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, false, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(radixSortBlocksKeysOnly<4, 0, true, true, true>, 4 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(findRadixOffsets<0, false, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(findRadixOffsets<0, true, true>, 3 * SORT_CTA_SIZE * sizeof(uint), SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, false, false, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, false, false, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, false, true, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, false, true, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, true, false, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, true, false, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, true, true, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderData<0, true, true, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, false, false, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, false, false, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, false, true, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, false, true, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, true, false, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, true, false, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, true, true, false, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(reorderDataKeysOnly<0, true, true, true, true>, 0, SORT_CTA_SIZE);
computeNumCTAs(emptyKernel, 0, SORT_CTA_SIZE);
}
}
}
/**
* @brief From the programmer-specified sort configuration,
* creates internal memory for performing the sort.
*
* @param[in] plan Pointer to CUDPPRadixSortPlan object
**/
extern "C"
void allocRadixSortStorage(CUDPPRadixSortPlan *plan)
{
unsigned int numElements = plan->m_numElements;
unsigned int numBlocks =
((numElements % (SORT_CTA_SIZE * 4)) == 0) ?
(numElements / (SORT_CTA_SIZE * 4)) :
(numElements / (SORT_CTA_SIZE * 4) + 1);
switch(plan->m_config.datatype)
{
case CUDPP_UINT:
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
numElements * sizeof(unsigned int)));
if (!plan->m_bKeysOnly)
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
numElements * sizeof(unsigned int)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
WARP_SIZE * numBlocks * sizeof(unsigned int)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
WARP_SIZE * numBlocks * sizeof(unsigned int)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
WARP_SIZE * numBlocks * sizeof(unsigned int)));
break;
case CUDPP_FLOAT:
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempKeys,
numElements * sizeof(float)));
if (!plan->m_bKeysOnly)
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_tempValues,
numElements * sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_counters,
WARP_SIZE * numBlocks * sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_countersSum,
WARP_SIZE * numBlocks * sizeof(float)));
CUDA_SAFE_CALL(cudaMalloc((void **)&plan->m_blockOffsets,
WARP_SIZE * numBlocks * sizeof(float)));
break;
}
initDeviceParameters(plan);
}
/** @brief Deallocates intermediate memory from allocRadixSortStorage.
*
*
* @param[in] plan Pointer to CUDPPRadixSortPlan object
**/
extern "C"
void freeRadixSortStorage(CUDPPRadixSortPlan* plan)
{
CUDA_SAFE_CALL( cudaFree(plan->m_tempKeys));
CUDA_SAFE_CALL( cudaFree(plan->m_tempValues));
CUDA_SAFE_CALL( cudaFree(plan->m_counters));
CUDA_SAFE_CALL( cudaFree(plan->m_countersSum));
CUDA_SAFE_CALL( cudaFree(plan->m_blockOffsets));
}
/** @brief Dispatch function to perform a sort on an array with
* a specified configuration.
*
* This is the dispatch routine which calls radixSort...() with
* appropriate template parameters and arguments as specified by
* the plan.
* @param[in,out] keys Keys to be sorted.
* @param[in,out] values Associated values to be sorted (through keys).
* @param[in] numElements Number of elements in the sort.
* @param[in] keyBits Number of interesting bits in the key
* @param[in] plan Configuration information for RadixSort.
**/
extern "C"
void cudppRadixSortDispatch(void *keys,
void *values,
size_t numElements,
int keyBits,
const CUDPPRadixSortPlan *plan)
{
if(plan->m_bKeysOnly)
{
switch(plan->m_config.datatype)
{
case CUDPP_UINT:
radixSortKeysOnly((uint*)keys, plan, false,
numElements, keyBits);
break;
case CUDPP_FLOAT:
radixSortFloatKeysOnly((float*)keys, plan, true,
numElements, keyBits);
}
}
else
{
switch(plan->m_config.datatype)
{
case CUDPP_UINT:
radixSort((uint*)keys, (uint*) values, plan,
numElements, false, keyBits);
break;
case CUDPP_FLOAT:
radixSortFloatKeys((float*)keys, (uint*) values, plan,
numElements, true, keyBits);
}
}
}
/** @} */ // end radixsort functions
/** @} */ // end cudpp_app

View File

@ -0,0 +1,771 @@
// -------------------------------------------------------------
// CUDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision: 5633 $
// $Date: 2009-07-01 15:02:51 +1000 (Wed, 01 Jul 2009) $
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* scan_app.cu
*
* @brief CUDPP application-level scan routines
*/
/** \defgroup cudpp_app CUDPP Application-Level API
* The CUDPP Application-Level API contains functions
* that run on the host CPU and invoke GPU routines in
* the CUDPP \link cudpp_kernel Kernel-Level API\endlink.
* Application-Level API functions are used by
* CUDPP \link publicInterface Public Interface\endlink
* functions to implement CUDPP's core functionality.
* @{
*/
/** @name Scan Functions
* @{
*/
#include "cudpp.h"
#include "cudpp_util.h"
#include "cudpp_plan.h"
#include "kernel/scan_kernel.cu"
#include "kernel/vector_kernel.cu"
#include <cutil.h>
#include <cstdlib>
#include <cstdio>
#include <assert.h>
/** @brief Perform recursive scan on arbitrary size arrays
*
* This is the CPU-side workhorse function of the scan engine. This function
* invokes the CUDA kernels which perform the scan on individual blocks.
*
* Scans of large arrays must be split (possibly recursively) into a hierarchy of block scans,
* where each block is scanned by a single CUDA thread block. At each recursive level of the
* scanArrayRecursive first invokes a kernel to scan all blocks of that level, and if the level
* has more than one block, it calls itself recursively. On returning from each recursive level,
* the total sum of each block from the level below is added to all elements of the corresponding
* block in this level. See "Parallel Prefix Sum (Scan) in CUDA" for more information (see
* \ref references ).
*
* Template parameter \a T is the datatype; \a isBackward specifies backward or forward scan;
* \a isExclusive specifies exclusive or inclusive scan, and \a op specifies the binary associative
* operator to be used.
*
* @param[out] d_out The output array for the scan results
* @param[in] d_in The input array to be scanned
* @param[out] d_blockSums Array of arrays of per-block sums (one array per recursive level, allocated
* by allocScanStorage())
* @param[in] numElements The number of elements in the array to scan
* @param[in] numRows The number of rows in the array to scan
* @param[in] rowPitches Array of row pitches (one array per recursive level, allocated by
* allocScanStorage())
* @param[in] level The current recursive level of the scan
*/
template <class T, bool isBackward, bool isExclusive, CUDPPOperator op>
void scanArrayRecursive(T *d_out,
const T *d_in,
T **d_blockSums,
size_t numElements,
size_t numRows,
const size_t *rowPitches,
int level)
{
unsigned int numBlocks =
max(1, (unsigned int)ceil((double)numElements / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE)));
unsigned int sharedEltsPerBlock = CTA_SIZE * 2;
unsigned int sharedMemSize = sizeof(T) * sharedEltsPerBlock;
// divide pitch by four since scan's load/store addresses are for vec4 elements
unsigned int rowPitch = 1;
unsigned int blockSumRowPitch = 1;
if (numRows > 1)
{
rowPitch = rowPitches[level] / 4;
blockSumRowPitch = (numBlocks > 1) ? rowPitches[level+1] / 4 : 0;
}
bool fullBlock = (numElements == numBlocks * SCAN_ELTS_PER_THREAD * CTA_SIZE);
// setup execution parameters
dim3 grid(numBlocks, numRows, 1);
dim3 threads(CTA_SIZE, 1, 1);
// make sure there are no CUDA errors before we start
CUT_CHECK_ERROR("scanArray before kernels");
unsigned int traitsCode = 0;
if (numBlocks > 1) traitsCode |= 1;
if (numRows > 1) traitsCode |= 2;
if (fullBlock) traitsCode |= 4;
switch (traitsCode)
{
case 0: // single block, single row, non-full block
scan4<T, ScanTraits<T, op, isBackward, isExclusive, false, false, false> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
break;
case 1: // multiblock, single row, non-full block
scan4< T, ScanTraits<T, op, isBackward, isExclusive, false, true, false> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
break;
case 2: // single block, multirow, non-full block
scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, false, false> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
break;
case 3: // multiblock, multirow, non-full block
scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, true, false> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
break;
case 4: // single block, single row, full block
scan4<T, ScanTraits<T, op, isBackward, isExclusive, false, false, true> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
break;
case 5: // multiblock, single row, full block
scan4< T, ScanTraits<T, op, isBackward, isExclusive, false, true, true> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
break;
case 6: // single block, multirow, full block
scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, false, true> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, 0, numElements, rowPitch, blockSumRowPitch);
break;
case 7: // multiblock, multirow, full block
scan4<T, ScanTraits<T, op, isBackward, isExclusive, true, true, true> >
<<< grid, threads, sharedMemSize >>>
(d_out, d_in, d_blockSums[level], numElements, rowPitch, blockSumRowPitch);
break;
}
CUT_CHECK_ERROR("prescan");
if (numBlocks > 1)
{
// After scanning all the sub-blocks, we are mostly done. But
// now we need to take all of the last values of the
// sub-blocks and scan those. This will give us a new value
// that must be added to each block to get the final results.
scanArrayRecursive<T, isBackward, true, op>
((T*)d_blockSums[level], (const T*)d_blockSums[level],
(T**)d_blockSums, numBlocks, numRows, rowPitches, level + 1); // recursive (CPU) call
vectorAddUniform4<T, op, SCAN_ELTS_PER_THREAD>
<<< grid, threads >>>(d_out,
(T*)d_blockSums[level],
numElements,
rowPitch*4,
blockSumRowPitch*4,
0, 0);
CUT_CHECK_ERROR("vectorAddUniform");
}
}
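/* Illustrative sketch (not part of the original CUDPP sources): a serial CPU
   exclusive add-scan, i.e. the result scanArrayRecursive produces for the
   (forward, exclusive, CUDPP_ADD) configuration on a single row. Handy as a
   reference when checking GPU results; the name is hypothetical. */
#include <cstddef>

static void exclusiveAddScanRef(const float *in, float *out, std::size_t n)
{
    float runningSum = 0.0f;
    for (std::size_t i = 0; i < n; ++i)
    {
        out[i] = runningSum;   // each output gets the sum of all earlier inputs
        runningSum += in[i];
    }
}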
// global
#ifdef __cplusplus
extern "C"
{
#endif
/** @brief Allocate intermediate arrays used by scan.
*
* Scans of large arrays must be split (possibly recursively) into a hierarchy
* of block scans, where each block is scanned by a single CUDA thread block.
* At each recursive level of the scan, we need an array in which to store the
* total sums of all blocks in that level. This function computes the amount
* of storage needed and allocates it.
*
* @param plan Pointer to CUDPPScanPlan object containing options and number
* of elements, which is used to compute storage requirements, and
* within which intermediate storage is allocated.
*/
void allocScanStorage(CUDPPScanPlan *plan)
{
//assert(config->_numEltsAllocated == 0); // shouldn't be called
plan->m_numEltsAllocated = plan->m_numElements;
size_t numElts = plan->m_numElements;
size_t level = 0;
do
{
size_t numBlocks =
max(1, (unsigned int)ceil((double)numElts / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE)));
if (numBlocks > 1)
{
level++;
}
numElts = numBlocks;
} while (numElts > 1);
size_t elementSize = 0;
switch(plan->m_config.datatype)
{
case CUDPP_INT:
plan->m_blockSums = (void**) malloc(level * sizeof(int*));
elementSize = sizeof(int);
break;
case CUDPP_UINT:
plan->m_blockSums = (void**) malloc(level * sizeof(unsigned int*));
elementSize = sizeof(unsigned int);
break;
case CUDPP_FLOAT:
plan->m_blockSums = (void**) malloc(level * sizeof(float*));
elementSize = sizeof(float);
break;
default:
break;
}
plan->m_numLevelsAllocated = level;
numElts = plan->m_numElements;
size_t numRows = plan->m_numRows;
plan->m_numRowsAllocated = numRows;
plan->m_rowPitches = 0;
if (numRows > 1)
{
plan->m_rowPitches = (size_t*) malloc((level + 1) * sizeof(size_t));
plan->m_rowPitches[0] = plan->m_rowPitch;
}
level = 0;
do
{
size_t numBlocks =
max(1, (unsigned int)ceil((double)numElts / ((double)SCAN_ELTS_PER_THREAD * CTA_SIZE)));
if (numBlocks > 1)
{
// Use cudaMallocPitch for multi-row block sums to ensure alignment
if (numRows > 1)
{
size_t dpitch;
CUDA_SAFE_CALL( cudaMallocPitch((void**) &(plan->m_blockSums[level]),
&dpitch,
numBlocks * elementSize,
numRows));
plan->m_rowPitches[level+1] = dpitch / elementSize;
level++;
}
else
{
CUDA_SAFE_CALL(cudaMalloc((void**) &(plan->m_blockSums[level++]),
numBlocks * elementSize));
}
}
numElts = numBlocks;
} while (numElts > 1);
CUT_CHECK_ERROR("allocScanStorage");
}
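/* Illustrative sketch (not part of the original CUDPP sources): the same
   level computation as the loops above, written as a standalone helper.
   Assuming the usual CUDPP configuration of SCAN_ELTS_PER_THREAD * CTA_SIZE
   = 1024 elements per block, e.g. 1,000,000 input elements need one
   intermediate block-sums array of 977 entries, and scanning those 977 sums
   fits in a single block, so only one level is allocated. */
#include <cstddef>
#include <vector>

static std::vector<std::size_t> blockSumLevelSizes(std::size_t numElements,
                                                   std::size_t eltsPerBlock)
{
    std::vector<std::size_t> sizes;   // one entry per allocated block-sums array
    std::size_t n = numElements;
    do {
        std::size_t numBlocks = (n + eltsPerBlock - 1) / eltsPerBlock;
        if (numBlocks > 1)
            sizes.push_back(numBlocks);
        n = numBlocks;
    } while (n > 1);
    return sizes;
}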
/** @brief Deallocate intermediate block sums arrays in a CUDPPScanPlan object.
*
* These arrays must have been allocated by allocScanStorage(), which is called
* by the constructor of cudppScanPlan().
*
* @param plan Pointer to CUDPPScanPlan object initialized by allocScanStorage().
*/
void freeScanStorage(CUDPPScanPlan *plan)
{
for (unsigned int i = 0; i < plan->m_numLevelsAllocated; i++)
{
cudaFree(plan->m_blockSums[i]);
}
CUT_CHECK_ERROR("freeScanStorage");
free((void**)plan->m_blockSums);
if (plan->m_numRows > 1)
free((void*)plan->m_rowPitches);
plan->m_blockSums = 0;
plan->m_numEltsAllocated = 0;
plan->m_numLevelsAllocated = 0;
}
/** @brief Dispatch function to perform a scan (prefix sum) on an
* array with the specified configuration.
*
* This is the dispatch routine which calls scanArrayRecursive() with
* appropriate template parameters and arguments to achieve the scan as
* specified in \a plan.
*
* @param[out] d_out The output array of scan results
* @param[in] d_in The input array
* @param[in] numElements The number of elements to scan
* @param[in] numRows The number of rows to scan in parallel
* @param[in] plan Pointer to CUDPPScanPlan object containing scan options
* and intermediate storage
*/
void cudppScanDispatch(void *d_out,
const void *d_in,
size_t numElements,
size_t numRows,
const CUDPPScanPlan *plan)
{
if (CUDPP_OPTION_EXCLUSIVE & plan->m_config.options)
{
if (CUDPP_OPTION_BACKWARD & plan->m_config.options)
{
switch (plan->m_config.datatype)
{
case CUDPP_INT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<int, true, true, CUDPP_ADD>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<int, true, true, CUDPP_MULTIPLY>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<int, true, true, CUDPP_MAX>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<int, true, true, CUDPP_MIN>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_UINT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<unsigned int, true, true, CUDPP_ADD>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<unsigned int, true, true, CUDPP_MULTIPLY>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<unsigned int, true, true, CUDPP_MAX>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<unsigned int, true, true, CUDPP_MIN>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_FLOAT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<float, true, true, CUDPP_ADD>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<float, true, true, CUDPP_MULTIPLY>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<float, true, true, CUDPP_MAX>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<float, true, true, CUDPP_MIN>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
default:
break;
}
}
else
{
switch (plan->m_config.datatype)
{
case CUDPP_INT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<int, false, true, CUDPP_ADD>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<int, false, true, CUDPP_MULTIPLY>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<int, false, true, CUDPP_MAX>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<int, false, true, CUDPP_MIN>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_UINT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<unsigned int, false, true, CUDPP_ADD>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<unsigned int, false, true, CUDPP_MULTIPLY>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<unsigned int, false, true, CUDPP_MAX>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<unsigned int, false, true, CUDPP_MIN>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_FLOAT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<float, false, true, CUDPP_ADD>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<float, false, true, CUDPP_MULTIPLY>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<float, false, true, CUDPP_MAX>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<float, false, true, CUDPP_MIN>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
default:
break;
}
}
}
else
{
if (CUDPP_OPTION_BACKWARD & plan->m_config.options)
{
switch (plan->m_config.datatype)
{
case CUDPP_INT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<int, true, false, CUDPP_ADD>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<int, true, false, CUDPP_MULTIPLY>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<int, true, false, CUDPP_MAX>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<int, true, false, CUDPP_MIN>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_UINT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<unsigned int, true, false, CUDPP_ADD>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<unsigned int, true, false, CUDPP_MULTIPLY>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<unsigned int, true, false, CUDPP_MAX>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<unsigned int, true, false, CUDPP_MIN>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_FLOAT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<float, true, false, CUDPP_ADD>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<float, true, false, CUDPP_MULTIPLY>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<float, true, false, CUDPP_MAX>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<float, true, false, CUDPP_MIN>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
default:
break;
}
}
else
{
switch (plan->m_config.datatype)
{
case CUDPP_INT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<int, false, false, CUDPP_ADD>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<int, false, false, CUDPP_MULTIPLY>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<int, false, false, CUDPP_MAX>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<int, false, false, CUDPP_MIN>
((int*)d_out, (const int*)d_in,
(int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_UINT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<unsigned int, false, false, CUDPP_ADD>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<unsigned int, false, false, CUDPP_MULTIPLY>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<unsigned int, false, false, CUDPP_MAX>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<unsigned int, false, false, CUDPP_MIN>
((unsigned int*)d_out, (const unsigned int*)d_in,
(unsigned int**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
case CUDPP_FLOAT:
switch(plan->m_config.op)
{
case CUDPP_ADD:
scanArrayRecursive<float, false, false, CUDPP_ADD>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MULTIPLY:
scanArrayRecursive<float, false, false, CUDPP_MULTIPLY>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MAX:
scanArrayRecursive<float, false, false, CUDPP_MAX>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
case CUDPP_MIN:
scanArrayRecursive<float, false, false, CUDPP_MIN>
((float*)d_out, (const float*)d_in,
(float**)plan->m_blockSums,
numElements, numRows, plan->m_rowPitches, 0);
break;
default:
break;
}
break;
default:
break;
}
}
}
}
#ifdef __cplusplus
}
#endif
/** @} */ // end scan functions
/** @} */ // end cudpp_app

View File

@ -0,0 +1,166 @@
// -------------------------------------------------------------
// cuDPP -- CUDA Data Parallel Primitives library
// -------------------------------------------------------------
// $Revision$
// $Date$
// -------------------------------------------------------------
// This source code is distributed under the terms of license.txt
// in the root directory of this source distribution.
// -------------------------------------------------------------
/**
* @file
* sharedmem.h
*
* @brief Shared memory declaration struct for templatized types.
*
* Because dynamically sized shared memory arrays are declared "extern" in CUDA,
* we can't templatize their types directly. To get around this, we declare a
* simple wrapper struct that will declare the extern array with a different
* name depending on the type. This avoids linker errors about multiple
* definitions.
*
* To use dynamically allocated shared memory in a templatized __global__ or
* __device__ function, just replace code like this:
*
* <pre>
* template<class T>
* __global__ void
* foo( T* d_out, T* d_in)
* {
* // Shared mem size is determined by the host app at run time
* extern __shared__ T sdata[];
* ...
* doStuff(sdata);
* ...
* }
* </pre>
*
* With this
* <pre>
* template<class T>
* __global__ void
* foo( T* d_out, T* d_in)
* {
* // Shared mem size is determined by the host app at run time
* SharedMemory<T> smem;
* T* sdata = smem.getPointer();
* ...
* doStuff(sdata);
* ...
* }
* </pre>
*/
#ifndef _SHAREDMEM_H_
#define _SHAREDMEM_H_
/** @brief Wrapper class for templatized dynamic shared memory arrays.
*
* This struct uses template specialization on the type \a T to declare
* a differently named dynamic shared memory array for each type
* (\code extern __shared__ T s_type[] \endcode).
*
* Currently there are specializations for the following types:
* \c int, \c uint, \c char, \c uchar, \c short, \c ushort, \c long,
* \c unsigned long, \c bool, \c float, \c double, and \c uchar4. One can also specialize it
* for user defined types.
*/
template <typename T>
struct SharedMemory
{
/** Return a pointer to the runtime-sized shared memory array. **/
__device__ T* getPointer()
{
extern __device__ void Error_UnsupportedType(); // Ensure that we won't compile any un-specialized types
Error_UnsupportedType();
return (T*)0;
}
// TODO: Use operator overloading to make this class look like a regular array
};
// Below are the specializations for the following types:
// int, uint, char, uchar, short, ushort, long, ulong, bool, float, double, and uchar4.
// One could also specialize it for user-defined types.
template <>
struct SharedMemory <int>
{
__device__ int* getPointer() { extern __shared__ int s_int[]; return s_int; }
};
template <>
struct SharedMemory <unsigned int>
{
__device__ unsigned int* getPointer() { extern __shared__ unsigned int s_uint[]; return s_uint; }
};
template <>
struct SharedMemory <char>
{
__device__ char* getPointer() { extern __shared__ char s_char[]; return s_char; }
};
template <>
struct SharedMemory <unsigned char>
{
__device__ unsigned char* getPointer() { extern __shared__ unsigned char s_uchar[]; return s_uchar; }
};
template <>
struct SharedMemory <short>
{
__device__ short* getPointer() { extern __shared__ short s_short[]; return s_short; }
};
template <>
struct SharedMemory <unsigned short>
{
__device__ unsigned short* getPointer() { extern __shared__ unsigned short s_ushort[]; return s_ushort; }
};
template <>
struct SharedMemory <long>
{
__device__ long* getPointer() { extern __shared__ long s_long[]; return s_long; }
};
template <>
struct SharedMemory <unsigned long>
{
__device__ unsigned long* getPointer() { extern __shared__ unsigned long s_ulong[]; return s_ulong; }
};
template <>
struct SharedMemory <bool>
{
__device__ bool* getPointer() { extern __shared__ bool s_bool[]; return s_bool; }
};
template <>
struct SharedMemory <float>
{
__device__ float* getPointer() { extern __shared__ float s_float[]; return s_float; }
};
template <>
struct SharedMemory <double>
{
__device__ double* getPointer() { extern __shared__ double s_double[]; return s_double; }
};
template <>
struct SharedMemory <uchar4>
{
__device__ uchar4* getPointer() { extern __shared__ uchar4 s_uchar4[]; return s_uchar4; }
};
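// Illustrative sketch (added; not part of the original header): specializing
// for a user-defined type follows the same pattern. The struct below is a
// hypothetical example type, not used elsewhere in this library.
struct sm_example_t { float x, y, z; int w; };
template <>
struct SharedMemory <sm_example_t>
{
__device__ sm_example_t* getPointer() { extern __shared__ sm_example_t s_sm_example[]; return s_sm_example; }
};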
#endif //_SHAREDMEM_H_
// Leave this at the end of the file
// Local Variables:
// mode:c++
// c-file-style: "NVIDIA"
// End:

449
lib/gpu/gb_gpu.cpp Normal file

@ -0,0 +1,449 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "gb_gpu_memory.h"
using namespace std;
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF;
#define GBMT GB_GPU_Memory<numtyp,acctyp>
template<class numtyp, class acctyp>
void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
const int inum, const int form_low, const int form_high) {
int stride=gbm.nbor->nbor_pitch();
int anall=gbm.atom->nall();
if (gbm.shared_types) {
GBMF.k_gb_nbor_fast.set_size(GX,BX);
GBMF.k_gb_nbor_fast.run(&gbm.atom->dev_x.begin(),
&gbm.cut_form.begin(), &gbm.nbor->dev_nbor.begin(), &stride,
&start, &inum, &gbm.nbor->dev_packed.begin(), &form_low,
&form_high, &anall);
} else {
GBMF.k_gb_nbor.set_size(GX,BX);
GBMF.k_gb_nbor.run(&gbm.atom->dev_x.begin(), &gbm.cut_form.begin(),
&gbm._lj_types, &gbm.nbor->dev_nbor.begin(), &stride,
&start, &inum, &gbm.nbor->dev_packed.begin(), &form_low,
&form_high, &anall);
}
}
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool gb_gpu_init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const double cell_size, int &gpu_mode, FILE *screen) {
GBMF.clear();
gpu_mode=GBMF.device->gpu_mode();
double gpu_split=GBMF.device->particle_split();
int first_gpu=GBMF.device->first_device();
int last_gpu=GBMF.device->last_device();
int world_me=GBMF.device->world_me();
int gpu_rank=GBMF.device->gpu_rank();
int procs_per_gpu=GBMF.device->procs_per_gpu();
GBMF.device->init_message(screen,"gayberne",first_gpu,last_gpu);
bool message=false;
if (world_me==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
if (world_me==0) {
bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
MPI_Barrier(MPI_COMM_WORLD);
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq,
sigma, epsilon, host_lshape, form, host_lj1,
host_lj2, host_lj3, host_lj4, offset, special_lj,
inum, nall, max_nbors, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
MPI_Barrier(GBMF.device->gpu_comm);
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
}
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
void gb_gpu_clear() {
GBMF.clear();
}
// ---------------------------------------------------------------------------
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum,
const int host_inum, const int nall,
double **host_x, double **host_quat,
int *host_type, double *boxlo,
double *boxhi, bool &success) {
gbm.nbor_time_avail=true;
success=true;
gbm.resize_atom(inum,nall,success);
gbm.resize_local(inum,host_inum,gbm.nbor->max_nbors(),0,success);
if (!success)
return;
gbm.atom->cast_copy_x(host_x,host_type);
int mn;
gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom,
boxlo, boxhi, NULL, NULL, NULL, success, mn);
gbm.nbor->copy_unpacked(inum,mn);
gbm.last_ellipse=inum;
gbm.max_last_ellipse=inum;
}
// ---------------------------------------------------------------------------
// Copy neighbor list from host and (if spheres) reorder so ellipses first
// ---------------------------------------------------------------------------
template <class gbmtyp>
void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall,
const int inum, const int osize,
int *ilist, int *numj,
int *type, int **firstneigh,
bool &success) {
success=true;
gbm.nbor_time_avail=true;
int mn=gbm.nbor->max_nbor_loop(inum,numj);
gbm.resize_atom(inum,nall,success);
gbm.resize_local(inum,0,mn,osize,success);
if (!success)
return;
if (gbm.multiple_forms) {
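// Two-pass reorder (note added for clarity): the first loop gathers atoms
// whose self form is ELLIPSE_ELLIPSE to the front of host_olist; the second
// appends the remaining non-ellipse (sphere) atoms, so the ellipse kernels
// can operate on the contiguous prefix [0, last_ellipse).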
int p=0;
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
p++;
}
}
gbm.max_last_ellipse=p;
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
for (int i=0; i<osize; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
p++;
}
}
gbm.nbor->get_host(inum,gbm.host_olist.begin(),numj,firstneigh,
gbm.block_size());
gbm.nbor->copy_unpacked(inum,mn);
return;
}
gbm.last_ellipse=inum;
gbm.max_last_ellipse=inum;
gbm.nbor->get_host(inum,ilist,numj,firstneigh,gbm.block_size());
gbm.nbor->copy_unpacked(inum,mn);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=gbm.block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum())/BX));
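// Note (added): GX above is the ceiling of inum/BX, i.e. one thread per local
// atom, rounded up to a whole block.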
int stride=gbm.nbor->nbor_pitch();
int ainum=gbm.atom->inum();
int anall=gbm.atom->nall();
if (gbm.multiple_forms) {
gbm.time_kernel.start();
if (gbm.last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
static_cast<double>(BX)));
gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
GBMF.k_gayberne.set_size(GX,BX);
GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(),
&gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(),
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall);
gbm.time_gayberne.stop();
if (gbm.last_ellipse==gbm.atom->inum()) {
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
gbm.time_pair.start();
gbm.time_pair.stop();
return;
}
// ------------ SPHERE_ELLIPSE ---------------
gbm.time_kernel2.start();
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom->inum()-
gbm.last_ellipse)/BX));
gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(),
SPHERE_ELLIPSE,SPHERE_ELLIPSE);
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
GBMF.k_sphere_gb.set_size(GX,BX);
GBMF.k_sphere_gb.run(&gbm.atom->dev_x.begin(),&gbm.atom->dev_quat.begin(),
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(),
&gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag,
&vflag, &gbm.last_ellipse, &ainum, &anall);
gbm.time_gayberne2.stop();
} else {
gbm.atom->dev_ans.zero();
gbm.atom->dev_engv.zero();
gbm.time_kernel.stop();
gbm.time_gayberne.start();
gbm.time_gayberne.stop();
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
}
// ------------ LJ ---------------
gbm.time_pair.start();
if (gbm.last_ellipse<gbm.atom->inum()) {
if (gbm.shared_types) {
GBMF.k_lj_fast.set_size(GX,BX);
GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(),
&stride, &gbm.nbor->dev_packed.begin(),
&gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
} else {
GBMF.k_lj.set_size(GX,BX);
GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(),
&gbm.lj3.begin(), &gbm._lj_types,
&gbm.gamma_upsilon_mu.begin(), &stride,
&gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(),
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &gbm.last_ellipse, &ainum, &anall);
}
}
gbm.time_pair.stop();
} else {
gbm.time_kernel.start();
gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE,
ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
GBMF.k_gayberne.set_size(GX,BX);
GBMF.k_gayberne.run(&gbm.atom->dev_x.begin(), &gbm.atom->dev_quat.begin(),
&gbm.shape.begin(), &gbm.well.begin(),
&gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(),
&gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(),
&stride, &gbm.atom->dev_ans.begin(), &ainum,
&gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(),
&eflag, &vflag, &ainum, &anall);
gbm.time_gayberne.stop();
}
}
// ---------------------------------------------------------------------------
// Reneighbor on GPU if necessary and then compute forces, torques, energies
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago,
const int inum_full, const int nall,
double **host_x, int *host_type,
double *boxlo, double *boxhi, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success,
double **host_quat) {
gbm.acc_timers();
if (inum_full==0) {
gbm.zero_timers();
return NULL;
}
gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor());
int inum=gbm.hd_balancer.get_gpu_count(timestep,ago,inum_full);
gbm.atom->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
// Build neighbor list on GPU if necessary
if (ago==0) {
_gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x,
host_quat, host_type, boxlo, boxhi, success);
if (!success)
return NULL;
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
} else {
gbm.atom->cast_x_data(host_x,host_type);
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
gbm.atom->add_x_data(host_x,host_type);
}
gbm.atom->add_other_data();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.atom->copy_answers(eflag,vflag,eatom,vatom);
gbm.hd_balancer.stop_timer();
return gbm.device->nbor.host_nbor.begin();
}
int * gb_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, const bool eflag,
const bool vflag, const bool eatom, const bool vatom,
int &host_start, const double cpu_time, bool &success,
double **host_quat) {
return _gb_gpu_compute_n(GBMF, timestep, ago, inum_full, nall, host_x,
host_type, boxlo, boxhi, eflag, vflag, eatom, vatom,
host_start, cpu_time, success, host_quat);
}
// ---------------------------------------------------------------------------
// Copy nbor list from host if necessary and then calculate forces, torques, ...
// ---------------------------------------------------------------------------
template <class gbmtyp>
inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago,
const int inum_full,const int nall,double **host_x,
int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success,
double **host_quat) {
gbm.acc_timers();
if (inum_full==0) {
gbm.zero_timers();
return NULL;
}
int ago=gbm.hd_balancer.ago_first(f_ago);
int inum=gbm.hd_balancer.balance(timestep,ago,inum_full,cpu_time,
gbm.nbor->gpu_nbor());
gbm.atom->inum(inum);
gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse);
host_start=inum;
if (ago==0) {
_gb_gpu_reset_nbors(gbm, nall, inum, inum_full, ilist, numj, host_type,
firstneigh, success);
if (!success)
return NULL;
}
int *list;
if (gbm.multiple_forms)
list=gbm.host_olist.begin();
else
list=ilist;
gbm.atom->cast_x_data(host_x,host_type);
gbm.atom->cast_quat_data(host_quat[0]);
gbm.hd_balancer.start_timer();
gbm.atom->add_x_data(host_x,host_type);
gbm.atom->add_other_data();
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(gbm,eflag,vflag);
gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list);
gbm.hd_balancer.stop_timer();
return list;
}
int * gb_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double **host_quat) {
return _gb_gpu_compute(GBMF, timestep, ago, inum_full, nall, host_x,
host_type, ilist, numj, firstneigh, eflag, vflag,
eatom, vatom, host_start, cpu_time, success,
host_quat);
}
// ---------------------------------------------------------------------------
// Return memory usage
// ---------------------------------------------------------------------------
double gb_gpu_bytes() {
return GBMF.host_memory_usage();
}
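// Rough host-side calling sequence (note added; illustrative only -- the pair
// style code in LAMMPS is the authoritative caller):
//   gb_gpu_init(...);                once, after coefficients are known
//   per step: gb_gpu_compute_n(...)  when neighbor lists are built on the GPU
//             gb_gpu_compute(...)    when they are copied from the host
//   gb_gpu_clear();                  at teardown
//   gb_gpu_bytes();                  may be queried at any time for memory use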


@ -1,595 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include "nvc_macros.h"
#include "nvc_timer.h"
#include "nvc_device.h"
#include "gb_gpu_memory.cu"
#include "gb_gpu_kernel.h"
using namespace std;
static GB_GPU_Memory<PRECISION,ACC_PRECISION> GBMF[MAX_GPU_THREADS];
#define GBMT GB_GPU_Memory<numtyp,acctyp>
// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
const int *dev_ij, const int form_low,
const int form_high, const int nall) {
// ii indexes the two interacting particles in gi
int ii=threadIdx.x+INT_MUL(blockIdx.x,blockDim.x)+start;
if (ii<inum) {
int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
nbor+=nbor_pitch;
int *nbor_newj=nbor;
nbor+=nbor_pitch;
vec4 ix=x_[i];
int itype=ix.w;
int newj=0;
for ( ; list<list_end; list++) {
int j=*list;
if (j>=nall)
j%=nall;
vec4 jx=x_[j];
int jtype=jx.w;
if (_form_(itype,jtype)>=form_low && _form_(itype,jtype)<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq< _cutsq_<numtyp>(itype,jtype)) {
*nbor=j;
nbor+=nbor_pitch;
newj++;
}
}
}
*nbor_newj=newj;
}
}
// ---------------------------------------------------------------------------
// Pack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only pack neighbors matching the specified inclusive range of forms
// -- Only pack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
template<class numtyp>
__global__ void kernel_pack_nbor_fast(const vec4 *x_, int *dev_nbor, const int nbor_pitch,
const int start, const int inum,
const int *dev_ij, const int form_low,
const int form_high, const int nall) {
int ii=threadIdx.x;
__shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
int itype=ii/MAX_SHARED_TYPES;
int jtype=ii%MAX_SHARED_TYPES;
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
form[ii]=_form_(itype,jtype);
}
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
__syncthreads();
if (ii<inum) {
int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
nbor+=nbor_pitch;
int *nbor_newj=nbor;
nbor+=nbor_pitch;
vec4 ix=x_[i];
int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
int newj=0;
for ( ; list<list_end; list++) {
int j=*list;
if (j>=nall)
j%=nall;
vec4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
if (form[mtype]>=form_low && form[mtype]<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
*nbor=j;
nbor+=nbor_pitch;
newj++;
}
}
}
*nbor_newj=newj;
}
}
template<class numtyp, class acctyp>
void pack_nbors(GBMT &gbm, const int GX, const int BX, const int start,
const int inum, const int form_low, const int form_high) {
if (gbm.shared_types) {
kernel_pack_nbor_fast<numtyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
gbm.atom.inum(), start, inum,
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
} else
kernel_pack_nbor<numtyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4 *)gbm.atom.dev_x.begin(),gbm.nbor.dev_nbor.begin(),
gbm.atom.inum(), start, inum,
gbm.nbor.ij.begin(),form_low,form_high,gbm.atom.nall());
}
// ---------------------------------------------------------------------------
// Convert something to a string
// ---------------------------------------------------------------------------
#include <sstream>
template <class t>
inline string gb_gpu_toa(const t& in) {
ostringstream o;
o.precision(2);
o << in;
return o.str();
}
// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
EXTERN void gb_gpu_name(const int id, const int max_nbors, char * name) {
string sname=GBMF[0].gpu.name(id)+", "+
gb_gpu_toa(GBMF[0].gpu.cores(id))+" cores, "+
gb_gpu_toa(GBMF[0].gpu.gigabytes(id))+" GB, "+
gb_gpu_toa(GBMF[0].gpu.clock_rate(id))+" GHZ";
strcpy(name,sname.c_str());
}
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
EXTERN bool gb_gpu_init(int &ij_size, const int ntypes, const double gamma,
const double upsilon, const double mu, double **shape,
double **well, double **cutsq, double **sigma,
double **epsilon, double *host_lshape, int **form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int nlocal, const int nall,
const int max_nbors, const int thread, const int gpu_id) {
assert(thread<MAX_GPU_THREADS);
GBMF[thread].gpu.init();
if (GBMF[thread].gpu.num_devices()==0)
return false;
ij_size=IJ_SIZE;
return GBMF[thread].init(ij_size, ntypes, gamma, upsilon, mu, shape,
well, cutsq, sigma, epsilon, host_lshape, form,
host_lj1, host_lj2, host_lj3, host_lj4, offset,
special_lj, nlocal, nall, max_nbors, false,
gpu_id);
}
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
EXTERN void gb_gpu_clear(const int thread) {
GBMF[thread].clear();
}
// ---------------------------------------------------------------------------
// copy atom positions, quaternions, and optionally types to device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline void _gb_gpu_atom(PairGPUAtom<numtyp,acctyp> &atom, double **host_x,
double **host_quat, const int *host_type,
const bool rebuild, cudaStream_t &stream) {
atom.time_atom.start();
atom.reset_write_buffer();
// Rows 1-3 of dev_x are position; rows 4-7 are quaternion
atom.add_x_data(host_x,host_type);
atom.add_q_data(host_quat[0]);
atom.copy_x_data(stream);
atom.copy_q_data(stream);
atom.time_atom.stop();
}
EXTERN void gb_gpu_atom(double **host_x, double **host_quat,
const int *host_type, const bool rebuild,
const int thread) {
_gb_gpu_atom(GBMF[thread].atom, host_x, host_quat, host_type, rebuild,
GBMF[thread].pair_stream);
}
// ---------------------------------------------------------------------------
// Signal that we need to transfer a new neighbor list
// ---------------------------------------------------------------------------
template <class gbmtyp>
int * _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, const int nlocal,
const int inum, int *ilist, const int *numj,
const int *type, bool &success) {
success=true;
gbm.nbor.time_nbor.start();
int mn=0;
for (int i=0; i<inum; i++)
mn=std::max(mn,numj[i]);
if (nall>gbm.max_atoms)
gbm.resize_atom(nall,success);
if (nlocal>gbm.max_local || mn>gbm._max_nbors)
gbm.resize_local(nlocal,mn,success);
if (!success)
return false;
gbm.atom.nall(nall);
gbm.atom.inum(inum);
if (gbm.multiple_forms) {
int ij_size=gbm.nbor.host_ij.numel();
if (inum*2<ij_size) {
int p=0, acc=0;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
}
}
gbm.last_ellipse=p;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[p]=numj[ilist[i]];
gbm.nbor.host_ij[p+inum]=acc;
acc+=numj[ilist[i]];
p++;
}
}
gbm.nbor.ij_total=0;
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum,
2*inum,gbm.pair_stream);
} else {
int p=0, acc=0;
int offset=0;
int half=ij_size/2;
int hi=0;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]==ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[hi]=numj[ilist[i]];
gbm.nbor.host_ij[hi+half]=acc;
acc+=numj[ilist[i]];
p++;
hi++;
if (hi==half) {
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
half,gbm.pair_stream);
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
inum*2+offset,
half,gbm.pair_stream);
hi=0;
offset+=half;
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
}
}
}
gbm.last_ellipse=p;
for (int i=0; i<inum; i++) {
int itype=type[ilist[i]];
if (gbm.host_form[itype][itype]!=ELLIPSE_ELLIPSE) {
gbm.host_olist[p]=ilist[i];
gbm.nbor.host_ij[hi]=numj[ilist[i]];
gbm.nbor.host_ij[hi+half]=acc;
acc+=numj[ilist[i]];
p++;
hi++;
if (hi==half) {
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
half,gbm.pair_stream);
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
inum*2+offset,
half,gbm.pair_stream);
hi=0;
offset+=half;
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
}
}
}
gbm.nbor.dev_nbor.copy_from_host(gbm.host_olist.begin(),inum);
if (hi>0) {
gbm.nbor.host_ij.copy_to_device(gbm.nbor.dev_nbor.begin()+inum+offset,
hi,gbm.pair_stream);
gbm.nbor.host_ij.copy_to_device(half,gbm.nbor.dev_nbor.begin()+
inum*2+offset,
hi,gbm.pair_stream);
}
gbm.nbor.ij_total=0;
}
} else {
gbm.nbor.reset(inum,ilist,numj,gbm.pair_stream);
gbm.last_ellipse=inum;
}
gbm.nbor.time_nbor.stop();
if (gbm.multiple_forms)
return gbm.host_olist.begin();
return ilist;
}
EXTERN int * gb_gpu_reset_nbors(const int nall, const int nlocal, const int inum,
int *ilist, const int *numj, const int *type,
const int thread, bool &success) {
return _gb_gpu_reset_nbors(GBMF[thread],nall,nlocal,inum,ilist,numj,type,
success);
}
// ---------------------------------------------------------------------------
// Copy a set of ij_size ij interactions to device and compute energies,
// forces, and torques for those interactions
// ---------------------------------------------------------------------------
template <class gbmtyp>
void _gb_gpu_nbors(gbmtyp &gbm, const int *ij, const int num_ij,
const bool eflag) {
gbm.nbor.time_nbor.add_to_total();
// CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream)); // Not if timed
memcpy(gbm.nbor.host_ij.begin(),ij,num_ij*sizeof(int));
gbm.nbor.time_nbor.start();
gbm.nbor.add(num_ij,gbm.pair_stream);
gbm.nbor.time_nbor.stop();
}
EXTERN void gb_gpu_nbors(const int *ij, const int num_ij, const bool eflag,
const int thread) {
_gb_gpu_nbors(GBMF[thread],ij,num_ij,eflag);
}
template<class numtyp, class acctyp>
void _gb_gpu_enqueue(GBMT &gbm, const bool eflag, const bool vflag) {
gbm.atom.time_answer.start();
gbm.atom.copy_answers(eflag,vflag,gbm.pair_stream);
gbm.atom.time_answer.stop();
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques for all ij interactions
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void _gb_gpu_gayberne(GBMT &gbm, const bool eflag, const bool vflag,
const bool rebuild) {
// Compute the block size and grid size to keep all cores busy
const int BX=BLOCK_1D;
int ans_pitch=6;
if (eflag)
ans_pitch++;
if (vflag)
ans_pitch+=6;
int GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum())/BX));
if (gbm.multiple_forms) {
gbm.time_kernel.start();
if (gbm.last_ellipse>0) {
// ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE ---------------
GX=static_cast<int>(ceil(static_cast<double>(gbm.last_ellipse)/
static_cast<double>(BX)));
pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,SPHERE_ELLIPSE,ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.nall());
gbm.time_gayberne.stop();
if (gbm.last_ellipse==gbm.atom.inum()) {
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
gbm.time_pair.start();
gbm.time_pair.stop();
return;
}
// ------------ SPHERE_ELLIPSE ---------------
gbm.time_kernel2.start();
GX=static_cast<int>(ceil(static_cast<double>(gbm.atom.inum()-
gbm.last_ellipse)/BX));
pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom.inum(),ELLIPSE_SPHERE,
ELLIPSE_SPHERE);
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
kernel_sphere_gb<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
gbm.time_gayberne2.stop();
} else {
gbm.atom.ans.zero();
gbm.time_kernel.stop();
gbm.time_gayberne.start();
gbm.time_gayberne.stop();
gbm.time_kernel2.start();
gbm.time_kernel2.stop();
gbm.time_gayberne2.start();
gbm.time_gayberne2.stop();
}
// ------------ LJ ---------------
gbm.time_pair.start();
if (gbm.last_ellipse<gbm.atom.inum()) {
if (gbm.shared_types)
kernel_lj_fast<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(), eflag,
vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
else
kernel_lj<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4*)gbm.atom.dev_x.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(), gbm.nbor.ij.begin(),
gbm.atom.ans.begin(), ans_pitch,gbm.dev_error.begin(),
eflag, vflag, gbm.last_ellipse, gbm.atom.inum(), gbm.atom.nall());
}
gbm.time_pair.stop();
} else {
gbm.time_kernel.start();
pack_nbors(gbm, GX, BX, 0, gbm.atom.inum(),SPHERE_SPHERE,ELLIPSE_ELLIPSE);
gbm.time_kernel.stop();
gbm.time_gayberne.start();
kernel_gayberne<numtyp,acctyp><<<GX,BX,0,gbm.pair_stream>>>
((vec4*)gbm.atom.dev_x.begin(), (vec4*)gbm.atom.dev_q.begin(),
gbm.gamma_upsilon_mu.begin(), gbm.special_lj.begin(),
gbm.nbor.dev_nbor.begin(), gbm.atom.inum(),
gbm.atom.ans.begin(), ans_pitch, gbm.dev_error.begin(),
eflag, vflag, gbm.atom.inum(), gbm.atom.nall());
gbm.time_gayberne.stop();
}
}
EXTERN void gb_gpu_gayberne(const bool eflag, const bool vflag, const bool rebuild,
const int thread) {
_gb_gpu_gayberne<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag,rebuild);
_gb_gpu_enqueue<PRECISION,ACC_PRECISION>(GBMF[thread],eflag,vflag);
}
// ---------------------------------------------------------------------------
// Get energies, forces, and torques to host
// ---------------------------------------------------------------------------
template<class numtyp, class acctyp>
double _gb_gpu_forces(GBMT &gbm, double **f, double **tor, const int *ilist,
const bool eflag, const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial) {
double evdw;
gbm.atom.time_atom.add_to_total();
gbm.nbor.time_nbor.add_to_total();
gbm.time_kernel.add_to_total();
gbm.time_gayberne.add_to_total();
if (gbm.multiple_forms) {
gbm.time_kernel2.add_to_total();
gbm.time_gayberne2.add_to_total();
gbm.time_pair.add_to_total();
}
CUDA_SAFE_CALL(cudaStreamSynchronize(gbm.pair_stream));
if (gbm.last_ellipse>gbm.atom.inum()) {
if (eflag || vflag)
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
f,tor,gbm.atom.inum());
else
gbm.atom.copy_asphere(ilist,f,tor,gbm.atom.inum());
} else {
if (eflag || vflag)
evdw=gbm.atom.energy_virial(ilist,eflag_atom,vflag_atom,eatom,vatom,virial,
f,tor,gbm.last_ellipse);
else
gbm.atom.copy_asphere(ilist,f,tor,gbm.last_ellipse);
}
gbm.atom.time_answer.add_to_total();
return evdw;
}
EXTERN double gb_gpu_forces(double **f, double **tor, const int *ilist,
const bool eflag, const bool vflag, const bool eflag_atom,
const bool vflag_atom, double *eatom, double **vatom,
double *virial, const int thread) {
return _gb_gpu_forces<PRECISION,ACC_PRECISION>
(GBMF[thread],f,tor,ilist,eflag,vflag,eflag_atom,
vflag_atom,eatom,vatom,virial);
}
EXTERN void gb_gpu_time(const int i) {
cout.precision(4);
cout << "Atom copy: " << GBMF[i].atom.time_atom.total_seconds()
<< " s.\n"
<< "Neighbor copy: " << GBMF[i].nbor.time_nbor.total_seconds()
<< " s.\n"
<< "Neighbor pack: " << GBMF[i].time_kernel.total_seconds()+
GBMF[i].time_kernel2.total_seconds() << " s.\n"
<< "Force calc: " << GBMF[i].time_gayberne.total_seconds()+
GBMF[i].time_gayberne2.total_seconds()<< " s.\n";
if (GBMF[i].multiple_forms)
cout << "LJ calc: " << GBMF[i].time_pair.total_seconds() << " s.\n";
cout << "Answer copy: " << GBMF[i].atom.time_answer.total_seconds()
<< " s.\n";
}
EXTERN int gb_gpu_num_devices() {
return GBMF[0].gpu.num_devices();
}
EXTERN double gb_gpu_bytes() {
return GBMF[0].host_memory_usage();
}


@ -12,44 +12,60 @@
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_EXTRA_H
#define GB_GPU_EXTRA_H
#include "math.h"
#include "stdio.h"
#include "string.h"
#define MAX_SHARED_TYPES 8
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
/* ----------------------------------------------------------------------
Atomic update of global memory
------------------------------------------------------------------------- */
/*
template <class numtyp> __device__
inline void atomicAdd(numtyp *address, numtyp val);
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
template <>
__device__ inline void atomicAdd<float>(float *address, float val)
{
int i_val = __float_as_int(val);
int tmp0 = 0;
int tmp1;
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0) {
tmp0 = tmp1;
i_val = __float_as_int(val + __int_as_float(tmp1));
}
}*/
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#endif
/* ----------------------------------------------------------------------
dot product of 2 vectors
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
__inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
{
return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2];
}
@ -58,9 +74,7 @@ static __inline__ __device__ numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
cross product of 2 vectors
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_cross3(const numtyp *v1,
const numtyp *v2, numtyp *ans)
__inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans)
{
ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
@ -71,8 +85,7 @@ static __inline__ __device__ void gpu_cross3(const numtyp *v1,
determinant of a matrix
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ numtyp gpu_det3(const numtyp m[9])
__inline numtyp gpu_det3(const numtyp m[9])
{
numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] -
m[3]*m[1]*m[8] + m[3]*m[2]*m[7] +
@ -84,47 +97,25 @@ static __inline__ __device__ numtyp gpu_det3(const numtyp m[9])
diagonal matrix times a full matrix
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_well_times3(const int i, const numtyp m[9],
numtyp ans[9])
__inline void gpu_times3(const numtyp4 shape, const numtyp m[9],
numtyp ans[9])
{
ans[0] = _well_<numtyp>(i,0)*m[0];
ans[1] = _well_<numtyp>(i,0)*m[1];
ans[2] = _well_<numtyp>(i,0)*m[2];
ans[3] = _well_<numtyp>(i,1)*m[3];
ans[4] = _well_<numtyp>(i,1)*m[4];
ans[5] = _well_<numtyp>(i,1)*m[5];
ans[6] = _well_<numtyp>(i,2)*m[6];
ans[7] = _well_<numtyp>(i,2)*m[7];
ans[8] = _well_<numtyp>(i,2)*m[8];
}
/* ----------------------------------------------------------------------
diagonal matrix times a full matrix
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_shape_times3(const int i, const numtyp m[9],
numtyp ans[9])
{
ans[0] = _shape_<numtyp>(i,0)*m[0];
ans[1] = _shape_<numtyp>(i,0)*m[1];
ans[2] = _shape_<numtyp>(i,0)*m[2];
ans[3] = _shape_<numtyp>(i,1)*m[3];
ans[4] = _shape_<numtyp>(i,1)*m[4];
ans[5] = _shape_<numtyp>(i,1)*m[5];
ans[6] = _shape_<numtyp>(i,2)*m[6];
ans[7] = _shape_<numtyp>(i,2)*m[7];
ans[8] = _shape_<numtyp>(i,2)*m[8];
ans[0] = shape.x*m[0];
ans[1] = shape.x*m[1];
ans[2] = shape.x*m[2];
ans[3] = shape.y*m[3];
ans[4] = shape.y*m[4];
ans[5] = shape.y*m[5];
ans[6] = shape.z*m[6];
ans[7] = shape.z*m[7];
ans[8] = shape.z*m[8];
}
/* ----------------------------------------------------------------------
add two matrices
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_plus3(const numtyp m[9],
const numtyp m2[9], numtyp ans[9])
__inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])
{
ans[0] = m[0]+m2[0];
ans[1] = m[1]+m2[1];
@ -141,10 +132,8 @@ static __inline__ __device__ void gpu_plus3(const numtyp m[9],
multiply the transpose of mat1 times mat2
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9],
const numtyp m2[9],
numtyp ans[9])
__inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],
numtyp ans[9])
{
ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];
@ -161,9 +150,7 @@ static __inline__ __device__ void gpu_transpose_times3(const numtyp m[9],
row vector times matrix
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_row_times3(const numtyp *v,
const numtyp m[9], numtyp *ans)
__inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans)
{
ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
@ -176,10 +163,8 @@ static __inline__ __device__ void gpu_row_times3(const numtyp *v,
error_flag set to 2 if bad matrix inversion attempted
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
const numtyp *v, numtyp *ans,
int *error_flag)
__inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
__global int *error_flag)
{
// create augmented matrix for pivoting
@ -297,12 +282,10 @@ static __inline__ __device__ void gpu_mldivide3(const numtyp m[9],
quat = [w i j k]
------------------------------------------------------------------------- */
template <class numtyp>
static __inline__ __device__ void gpu_quat_to_mat_trans(const vec4 *qif,
const int qi,
numtyp mat[9])
__inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,
numtyp mat[9])
{
vec4 q=qif[qi];
numtyp4 q=qif[qi];
numtyp w2 = q.x*q.x;
numtyp i2 = q.y*q.y;

383
lib/gpu/gb_gpu_kernel.cu Normal file

@ -0,0 +1,383 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_KERNEL
#define GB_GPU_KERNEL
#ifdef NV_KERNEL
#include "gb_gpu_extra.h"
#endif
__inline void compute_eta_torque(numtyp m[9],numtyp m2[9], const numtyp4 shape,
numtyp ans[9])
{
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
den = (numtyp)1.0/den;
ans[0] = shape.x*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
ans[1] = shape.x*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
ans[2] = shape.x*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
ans[3] = shape.y*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
ans[4] = shape.y*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
ans[5] = shape.y*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
ans[6] = shape.z*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
ans[7] = shape.z*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
ans[8] = shape.z*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
__kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q,
__global numtyp4* shape, __global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, const int astride,
__global acctyp *engv, __global int *err_flag,
const int eflag, const int vflag, const int inum,
const int nall) {
__local numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
if (ii<4)
sp_lj[ii]=gum[ii+3];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
__syncthreads();
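// (note added) sp_lj now holds the four special_lj scaling factors loaded
// from gum[3..6] above; the barrier makes them visible to every thread in
// the block before the neighbor loop begins.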
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp4 tor;
tor.x=(numtyp)0;
tor.y=(numtyp)0;
tor.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+mul24(stride,numj);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
numtyp4 ishape=shape[itype];
{
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_times3(ishape,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_times3(well[itype],a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=stride) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
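// (note added) Shifted-LJ core of the Gay-Berne interaction:
//   U_r = 4*eps*(varrho^12 - varrho^6),  varrho = sigma/(h12 + gamma*sigma),
// where gum[0] holds gamma and h12 is the distance of closest approach.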
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
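// (note added) i.e. eta = (2*lshape_i*lshape_j / det(G12))^upsilon, with
// gum[1] holding upsilon.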
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,ishape,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
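// (note added) i.e. chi = (2 * r12hat . B12^-1 . r12hat)^mu, with gum[2]
// holding mu.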
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/
gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
// Store answers
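// (note added) Layout: engv stores the energy followed by the six virial
// components, each astride apart; ans stores the force at ii and the torque
// at ii+astride.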
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=astride;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=astride;
}
}
ans[ii]=f;
ans[ii+astride]=tor;
} // if ii
}
#endif


@ -1,863 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_KERNEL
#define GB_GPU_KERNEL
#include "gb_gpu_extra.h"
template <class numtyp>
static __inline__ __device__ void compute_eta_torque(numtyp m[9],
numtyp m2[9],
const int i,
numtyp ans[9])
{
numtyp den = m[3]*m[2]*m[7]-m[0]*m[5]*m[7]-
m[2]*m[6]*m[4]+m[1]*m[6]*m[5]-
m[3]*m[1]*m[8]+m[0]*m[4]*m[8];
den = (numtyp)1.0/den;
numtyp shapex=_shape_<numtyp>(i,0);
numtyp shapey=_shape_<numtyp>(i,1);
numtyp shapez=_shape_<numtyp>(i,2);
ans[0] = shapex*(m[5]*m[1]*m2[2]+(numtyp)2.0*m[4]*m[8]*m2[0]-
m[4]*m2[2]*m[2]-(numtyp)2.0*m[5]*m2[0]*m[7]+
m2[1]*m[2]*m[7]-m2[1]*m[1]*m[8]-
m[3]*m[8]*m2[1]+m[6]*m[5]*m2[1]+
m[3]*m2[2]*m[7]-m2[2]*m[6]*m[4])*den;
ans[1] = shapex*(m[2]*m2[0]*m[7]-m[8]*m2[0]*m[1]+
(numtyp)2.0*m[0]*m[8]*m2[1]-m[0]*m2[2]*m[5]-
(numtyp)2.0*m[6]*m[2]*m2[1]+m2[2]*m[3]*m[2]-
m[8]*m[3]*m2[0]+m[6]*m2[0]*m[5]+
m[6]*m2[2]*m[1]-m2[2]*m[0]*m[7])*den;
ans[2] = shapex*(m[1]*m[5]*m2[0]-m[2]*m2[0]*m[4]-
m[0]*m[5]*m2[1]+m[3]*m[2]*m2[1]-
m2[1]*m[0]*m[7]-m[6]*m[4]*m2[0]+
(numtyp)2.0*m[4]*m[0]*m2[2]-(numtyp)2.0*m[3]*m2[2]*m[1]+
m[3]*m[7]*m2[0]+m[6]*m2[1]*m[1])*den;
ans[3] = shapey*(-m[4]*m2[5]*m[2]+(numtyp)2.0*m[4]*m[8]*m2[3]+
m[5]*m[1]*m2[5]-(numtyp)2.0*m[5]*m2[3]*m[7]+
m2[4]*m[2]*m[7]-m2[4]*m[1]*m[8]-
m[3]*m[8]*m2[4]+m[6]*m[5]*m2[4]-
m2[5]*m[6]*m[4]+m[3]*m2[5]*m[7])*den;
ans[4] = shapey*(m[2]*m2[3]*m[7]-m[1]*m[8]*m2[3]+
(numtyp)2.0*m[8]*m[0]*m2[4]-m2[5]*m[0]*m[5]-
(numtyp)2.0*m[6]*m2[4]*m[2]-m[3]*m[8]*m2[3]+
m[6]*m[5]*m2[3]+m[3]*m2[5]*m[2]-
m[0]*m2[5]*m[7]+m2[5]*m[1]*m[6])*den;
ans[5] = shapey*(m[1]*m[5]*m2[3]-m[2]*m2[3]*m[4]-
m[0]*m[5]*m2[4]+m[3]*m[2]*m2[4]+
(numtyp)2.0*m[4]*m[0]*m2[5]-m[0]*m2[4]*m[7]+
m[1]*m[6]*m2[4]-m2[3]*m[6]*m[4]-
(numtyp)2.0*m[3]*m[1]*m2[5]+m[3]*m2[3]*m[7])*den;
ans[6] = shapez*(-m[4]*m[2]*m2[8]+m[1]*m[5]*m2[8]+
(numtyp)2.0*m[4]*m2[6]*m[8]-m[1]*m2[7]*m[8]+
m[2]*m[7]*m2[7]-(numtyp)2.0*m2[6]*m[7]*m[5]-
m[3]*m2[7]*m[8]+m[5]*m[6]*m2[7]-
m[4]*m[6]*m2[8]+m[7]*m[3]*m2[8])*den;
ans[7] = shapez*-(m[1]*m[8]*m2[6]-m[2]*m2[6]*m[7]-
(numtyp)2.0*m2[7]*m[0]*m[8]+m[5]*m2[8]*m[0]+
(numtyp)2.0*m2[7]*m[2]*m[6]+m[3]*m2[6]*m[8]-
m[3]*m[2]*m2[8]-m[5]*m[6]*m2[6]+
m[0]*m2[8]*m[7]-m2[8]*m[1]*m[6])*den;
ans[8] = shapez*(m[1]*m[5]*m2[6]-m[2]*m2[6]*m[4]-
m[0]*m[5]*m2[7]+m[3]*m[2]*m2[7]-
m[4]*m[6]*m2[6]-m[7]*m2[7]*m[0]+
(numtyp)2.0*m[4]*m2[8]*m[0]+m[7]*m[3]*m2[6]+
m[6]*m[1]*m2[7]-(numtyp)2.0*m2[8]*m[3]*m[1])*den;
}
#include "gb_gpu_kernel.h"
template<class numtyp, class acctyp>
__global__ void kernel_gayberne(const vec4* x_, const vec4 *q,
const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const size_t nbor_pitch,
acctyp *ans, size_t ans_pitch, int *err_flag,
const bool eflag, const bool vflag,
const int inum, const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
if (ii<4)
sp_lj[ii]=special_lj[ii];
ii+=INT_MUL(blockIdx.x,blockDim.x);
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp torx=(numtyp)0;
acctyp tory=(numtyp)0;
acctyp torz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
nbor+=nbor_pitch;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *nbor_end=nbor+nbor_pitch*numj;
vec4 ix=x_[i];
int itype=ix.w;
numtyp a1[9], b1[9], g1[9];
{
numtyp t[9];
gpu_quat_to_mat_trans(q,i,a1);
gpu_shape_times3(itype,a1,t);
gpu_transpose_times3(a1,t,g1);
gpu_well_times3(itype,a1,t);
gpu_transpose_times3(a1,t,b1);
}
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
vec4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], tUr[3], eta, teta[3];
{ // Compute U_r, dUr, eta, and teta
// Compute g12
numtyp g12[9];
{
numtyp g2[9];
{
gpu_shape_times3(jtype,a2,g12);
gpu_transpose_times3(a2,g12,g2);
gpu_plus3(g1,g2,g12);
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- replace r12 with r12 hat
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
numtyp sigma = _sigma_<numtyp>(itype,jtype);
numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]);
dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]);
dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]);
}
// torque for particle 1
{
numtyp tempv[3], tempv2[3];
tempv[0] = -uslj_rsq*kappa[0];
tempv[1] = -uslj_rsq*kappa[1];
tempv[2] = -uslj_rsq*kappa[2];
gpu_row_times3(kappa,g1,tempv2);
gpu_cross3(tempv,tempv2,tUr);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
// Compute teta
numtyp temp[9], tempv[3], tempv2[3];
compute_eta_torque(g12,a1,itype,temp);
numtyp temp1 = -eta*gum[1];
tempv[0] = temp1*temp[0];
tempv[1] = temp1*temp[1];
tempv[2] = temp1*temp[2];
gpu_cross3(a1,tempv,tempv2);
teta[0] = tempv2[0];
teta[1] = tempv2[1];
teta[2] = tempv2[2];
tempv[0] = temp1*temp[3];
tempv[1] = temp1*temp[4];
tempv[2] = temp1*temp[5];
gpu_cross3(a1+3,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
tempv[0] = temp1*temp[6];
tempv[1] = temp1*temp[7];
tempv[2] = temp1*temp[8];
gpu_cross3(a1+6,tempv,tempv2);
teta[0] += tempv2[0];
teta[1] += tempv2[1];
teta[2] += tempv2[2];
}
numtyp chi, dchi[3], tchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b2[9], b12[9];
{
gpu_well_times3(jtype,a2,b12);
gpu_transpose_times3(a2,b12,b2);
gpu_plus3(b1,b2,b12);
}
// compute chi_12
r12[0]*=r;
r12[1]*=r;
r12[2]*=r;
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
r12[0]*=ir;
r12[1]*=ir;
r12[2]*=ir;
chi = gpu_dot3(r12,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
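// gum[2] holds mu: chi = (2 * r12hat . B12^-1 . r12hat)^mu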
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12[0]);
dchi[1] = temp2*(iota[1]-temp1*r12[1]);
dchi[2] = temp2*(iota[2]-temp1*r12[2]);
// compute t_chi
numtyp tempv[3];
gpu_row_times3(iota,b1,tempv);
gpu_cross3(tempv,iota,tchi);
temp1 = (numtyp)-4.0*ir*ir;
tchi[0] *= temp1;
tchi[1] *= temp1;
tchi[2] *= temp1;
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag) {
r12[0]*=-r;
r12[1]*=-r;
r12[2]*=-r;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
fx+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
fy+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
fz+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
fx+=temp1*dchi[0]-temp2*dUr[0];
fy+=temp1*dchi[1]-temp2*dUr[1];
fz+=temp1*dchi[2]-temp2*dUr[2];
}
// Torque on 1
temp1 = -u_r*eta*factor_lj;
temp2 = -u_r*chi*factor_lj;
numtyp temp3 = -chi*eta*factor_lj;
torx+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0];
tory+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1];
torz+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2];
} // for nbor
// Store answers
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1=energy;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1++;
}
}
*ap1=fx;
ap1++;
*ap1=fy;
ap1++;
*ap1=fz;
ap1++;
*ap1=torx;
ap1++;
*ap1=tory;
ap1++;
*ap1=torz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_sphere_gb(const vec4 *x_, const vec4 *q,
const numtyp *gum, const numtyp *special_lj,
const int *dev_nbor, const size_t nbor_pitch,
acctyp *ans, size_t ans_pitch, int *err_flag,
const bool eflag, const bool vflag,
const int start, const int inum,
const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
if (ii<4)
sp_lj[ii]=special_lj[ii];
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
__syncthreads();
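// the first four threads of the block stage the special_lj factors in shared
// memory; the barrier above makes them visible before the neighbor loop starts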
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
nbor+=nbor_pitch;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *nbor_end=nbor+nbor_pitch*numj;
vec4 ix=x_[i];
int itype=ix.w;
numtyp oner=_shape_<numtyp>(itype,0);
numtyp one_well=_well_<numtyp>(itype,0);
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
vec4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp r12hat[3];
r12hat[0]=r12[0]*ir;
r12hat[1]=r12[1]*ir;
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], eta;
{ // Compute U_r, dUr, and eta
// Compute g12
numtyp g12[9];
{
{
numtyp g2[9];
gpu_shape_times3(jtype,a2,g12);
gpu_transpose_times3(a2,g12,g2);
g12[0]=g2[0]+oner;
g12[4]=g2[4]+oner;
g12[8]=g2[8]+oner;
g12[1]=g2[1];
g12[2]=g2[2];
g12[3]=g2[3];
g12[5]=g2[5];
g12[6]=g2[6];
g12[7]=g2[7];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12hat,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
numtyp sigma = _sigma_<numtyp>(itype,jtype);
numtyp epsilon = _epsilon_<numtyp>(itype,jtype);
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12hat);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*_lshape_<numtyp>(itype)*_lshape_<numtyp>(jtype);
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
}
numtyp chi, dchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b12[9];
{
numtyp b2[9];
gpu_well_times3(jtype,a2,b12);
gpu_transpose_times3(a2,b12,b2);
b12[0]=b2[0]+one_well;
b12[4]=b2[4]+one_well;
b12[8]=b2[8]+one_well;
b12[1]=b2[1];
b12[2]=b2[2];
b12[3]=b2[3];
b12[5]=b2[5];
b12[6]=b2[6];
b12[7]=b2[7];
}
// compute chi_12
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
chi = gpu_dot3(r12hat,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12hat);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag) {
r12[0]*=-1;
r12[1]*=-1;
r12[2]*=-1;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
fx+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
fy+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
fz+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
fx+=temp1*dchi[0]-temp2*dUr[0];
fy+=temp1*dchi[1]-temp2*dUr[1];
fz+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1=energy;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1++;
}
}
*ap1=fx;
ap1++;
*ap1=fy;
ap1++;
*ap1=fz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_lj(const vec4 *x_,
const numtyp *special_lj, const int *dev_nbor,
const size_t nbor_pitch, const int *dev_ij, acctyp *ans,
size_t ans_pitch, int *err_flag, const bool eflag,
const bool vflag, const int start, const int inum,
const int nall) {
__shared__ numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
if (ii<4)
sp_lj[ii]=special_lj[ii];
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
vec4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j=*list;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
vec4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<_cutsq_<numtyp>(itype,jtype) &&
_form_(itype,jtype)==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(_lj1_<numtyp>(itype,jtype).x*r6inv-
_lj1_<numtyp>(itype,jtype).y);
force*=factor_lj;
fx+=delx*force;
fy+=dely*force;
fz+=delz*force;
if (eflag) {
numtyp e=r6inv*(_lj3_<numtyp>(itype,jtype).x*r6inv-
_lj3_<numtyp>(itype,jtype).y);
energy+=factor_lj*(e-_offset_<numtyp>(itype,jtype));
}
if (vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1+=energy;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1++;
}
}
*ap1+=fx;
ap1++;
*ap1+=fy;
ap1++;
*ap1+=fz;
} // if ii
}
template<class numtyp, class acctyp>
__global__ void kernel_lj_fast(const vec4 *x_,
const numtyp *special_lj, const int *dev_nbor,
const size_t nbor_pitch, const int *dev_ij,
acctyp *ans, size_t ans_pitch,int *err_flag,
const bool eflag, const bool vflag,
const int start, const int inum, const int nall){
// ii indexes the two interacting particles in gi
int ii=threadIdx.x;
__shared__ numtyp sp_lj[4];
__shared__ int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp lj4[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ numtyp offset[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<4)
sp_lj[ii]=special_lj[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
int itype=ii/MAX_SHARED_TYPES;
int jtype=ii%MAX_SHARED_TYPES;
cutsq[ii]=_cutsq_<numtyp>(itype,jtype);
form[ii]=_form_(itype,jtype);
lj1[ii]=_lj1_<numtyp>(itype,jtype).x;
lj2[ii]=_lj1_<numtyp>(itype,jtype).y;
if (eflag) {
lj3[ii]=_lj3_<numtyp>(itype,jtype).x;
lj4[ii]=_lj3_<numtyp>(itype,jtype).y;
offset[ii]=_offset_<numtyp>(itype,jtype);
}
}
ii+=INT_MUL(blockIdx.x,blockDim.x)+start;
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp fx=(numtyp)0;
acctyp fy=(numtyp)0;
acctyp fz=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
const int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
const int *list=dev_ij+*nbor;
const int *list_end=list+numj;
vec4 ix=x_[i];
int itype=INT_MUL(MAX_SHARED_TYPES,ix.w);
numtyp factor_lj;
for ( ; list<list_end; list++) {
int j=*list;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
vec4 jx=x_[j];
int mtype=itype+jx.w;
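// itype was pre-scaled by MAX_SHARED_TYPES, so mtype is the row-major index of
// the (itype,jtype) pair in the shared coefficient arrays loaded above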
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<cutsq[mtype] && form[mtype]==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype]*r6inv-lj2[mtype]);
fx+=delx*force;
fy+=dely*force;
fz+=delz*force;
if (eflag) {
numtyp e=r6inv*(lj3[mtype]*r6inv-lj4[mtype]);
energy+=factor_lj*(e-offset[mtype]);
}
if (vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
acctyp *ap1=ans+ii*ans_pitch;
if (eflag) {
*ap1+=energy;
ap1++;
}
if (vflag) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1++;
}
}
*ap1+=fx;
ap1++;
*ap1+=fy;
ap1++;
*ap1+=fz;
} // if ii
}
#endif

472
lib/gpu/gb_gpu_kernel_lj.cu Normal file

@@ -0,0 +1,472 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_KERNEL_LJ
#define GB_GPU_KERNEL_LJ
#ifdef NV_KERNEL
#include "gb_gpu_extra.h"
#endif
__kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q,
__global numtyp4* shape,__global numtyp4* well,
__global numtyp *gum, __global numtyp2* sig_eps,
const int ntypes, __global numtyp *lshape,
__global int *dev_nbor, const int stride,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag,const int start, const int inum,
const int nall) {
__local numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
if (ii<4)
sp_lj[ii]=gum[ii+3];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *nbor_end=nbor+stride*numj;
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp oner=shape[itype].x;
numtyp one_well=well[itype].x;
numtyp factor_lj;
for ( ; nbor<nbor_end; nbor+=stride) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp r12[3];
r12[0] = jx.x-ix.x;
r12[1] = jx.y-ix.y;
r12[2] = jx.z-ix.z;
numtyp ir = gpu_dot3(r12,r12);
ir = rsqrt(ir);
numtyp r = (numtyp)1.0/ir;
numtyp r12hat[3];
r12hat[0]=r12[0]*ir;
r12hat[1]=r12[1]*ir;
r12hat[2]=r12[2]*ir;
numtyp a2[9];
gpu_quat_to_mat_trans(q,j,a2);
numtyp u_r, dUr[3], eta;
{ // Compute U_r, dUr, and eta
// Compute g12
numtyp g12[9];
{
{
numtyp g2[9];
gpu_times3(shape[jtype],a2,g12);
gpu_transpose_times3(a2,g12,g2);
g12[0]=g2[0]+oner;
g12[4]=g2[4]+oner;
g12[8]=g2[8]+oner;
g12[1]=g2[1];
g12[2]=g2[2];
g12[3]=g2[3];
g12[5]=g2[5];
g12[6]=g2[6];
g12[7]=g2[7];
}
{ // Compute U_r and dUr
// Compute kappa
numtyp kappa[3];
gpu_mldivide3(g12,r12,kappa,err_flag);
// -- kappa is now / r
kappa[0]*=ir;
kappa[1]*=ir;
kappa[2]*=ir;
// energy
// compute u_r and dUr
numtyp uslj_rsq;
{
// Compute distance of closest approach
numtyp h12, sigma12;
sigma12 = gpu_dot3(r12hat,kappa);
sigma12 = rsqrt((numtyp)0.5*sigma12);
h12 = r-sigma12;
// -- kappa is now ok
kappa[0]*=r;
kappa[1]*=r;
kappa[2]*=r;
int mtype=mul24(ntypes,itype)+jtype;
numtyp sigma = sig_eps[mtype].x;
numtyp epsilon = sig_eps[mtype].y;
numtyp varrho = sigma/(h12+gum[0]*sigma);
numtyp varrho6 = varrho*varrho*varrho;
varrho6*=varrho6;
numtyp varrho12 = varrho6*varrho6;
u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
temp1 = temp1*(numtyp)24.0*epsilon;
uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
numtyp temp2 = gpu_dot3(kappa,r12hat);
uslj_rsq = uslj_rsq*ir*ir;
dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
}
}
}
// Compute eta
{
eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
numtyp det_g12 = gpu_det3(g12);
eta = pow(eta/det_g12,gum[1]);
}
}
numtyp chi, dchi[3];
{ // Compute chi and dchi
// Compute b12
numtyp b12[9];
{
numtyp b2[9];
gpu_times3(well[jtype],a2,b12);
gpu_transpose_times3(a2,b12,b2);
b12[0]=b2[0]+one_well;
b12[4]=b2[4]+one_well;
b12[8]=b2[8]+one_well;
b12[1]=b2[1];
b12[2]=b2[2];
b12[3]=b2[3];
b12[5]=b2[5];
b12[6]=b2[6];
b12[7]=b2[7];
}
// compute chi_12
numtyp iota[3];
gpu_mldivide3(b12,r12,iota,err_flag);
// -- iota is now iota/r
iota[0]*=ir;
iota[1]*=ir;
iota[2]*=ir;
chi = gpu_dot3(r12hat,iota);
chi = pow(chi*(numtyp)2.0,gum[2]);
// -- iota is now ok
iota[0]*=r;
iota[1]*=r;
iota[2]*=r;
numtyp temp1 = gpu_dot3(iota,r12hat);
numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/gum[2]);
dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
}
numtyp temp2 = factor_lj*eta*chi;
if (eflag>0)
energy+=u_r*temp2;
numtyp temp1 = -eta*u_r*factor_lj;
if (vflag>0) {
r12[0]*=-1;
r12[1]*=-1;
r12[2]*=-1;
numtyp ft=temp1*dchi[0]-temp2*dUr[0];
f.x+=ft;
virial[0]+=r12[0]*ft;
ft=temp1*dchi[1]-temp2*dUr[1];
f.y+=ft;
virial[1]+=r12[1]*ft;
virial[3]+=r12[0]*ft;
ft=temp1*dchi[2]-temp2*dUr[2];
f.z+=ft;
virial[2]+=r12[2]*ft;
virial[4]+=r12[0]*ft;
virial[5]+=r12[1]*ft;
} else {
f.x+=temp1*dchi[0]-temp2*dUr[0];
f.y+=temp1*dchi[1]-temp2*dUr[1];
f.z+=temp1*dchi[2]-temp2*dUr[2];
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *gum,
const int stride, __global int *dev_ij,
__global acctyp4 *ans, __global acctyp *engv,
__global int *err_flag, const int eflag,
const int vflag, const int start, const int inum,
const int nall) {
__local numtyp sp_lj[4];
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
if (ii<4)
sp_lj[ii]=gum[ii+3];
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
numtyp4 ix=x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=stride) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int ii=itype*lj_types+jtype;
if (r2inv<lj1[ii].z && lj1[ii].w==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(lj1[ii].x*r6inv-lj1[ii].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y);
energy+=factor_lj*(e-lj3[ii].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
__kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in, __global numtyp *gum,
const int stride,
__global int *dev_ij, __global acctyp4 *ans,
__global acctyp *engv, __global int *err_flag,
const int eflag,const int vflag, const int start,
const int inum, const int nall) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
__local numtyp sp_lj[4];
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<4)
sp_lj[ii]=gum[ii+3];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (eflag>0)
lj3[ii]=lj3_in[ii];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start;
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=stride;
int numj=*nbor;
nbor+=stride;
__global int *list_end=nbor+mul24(stride,numj);
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=stride) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1+=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1+=virial[i];
ap1+=inum;
}
}
acctyp4 old=ans[ii];
old.x+=f.x;
old.y+=f.y;
old.z+=f.z;
ans[ii]=old;
} // if ii
}
#endif


@@ -0,0 +1,170 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef PAIR_GPU_KERNEL_H
#define PAIR_GPU_KERNEL_H
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#else
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#endif
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#endif
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// ---------------------------------------------------------------------------
__kernel void kernel_gb_nbor(__global numtyp4 *x_, __global numtyp2 *cut_form,
const int ntypes, __global int *dev_nbor,
const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high, const int nall) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X+start;
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
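// packed neighbor indices are written starting at row 2 of dev_nbor (stride
// nbor_pitch); the per-particle count newj goes to row 1 after the loop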
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24(iw,ntypes);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j>=nall)
j%=nall;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
numtyp2 cf=cut_form[mtype];
if (cf.y>=form_low && cf.y<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cf.x) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
// ---------------------------------------------------------------------------
// Unpack neighbors from dev_ij array into dev_nbor matrix for coalesced access
// -- Only unpack neighbors matching the specified inclusive range of forms
// -- Only unpack neighbors within cutoff
// -- Fast version of routine that uses shared memory for LJ constants
// ---------------------------------------------------------------------------
__kernel void kernel_gb_nbor_fast(__global numtyp4 *x_,
__global numtyp2 *cut_form,
__global int *dev_nbor,
const int nbor_pitch,
const int start, const int inum,
__global int *dev_ij, const int form_low,
const int form_high, const int nall) {
int ii=THREAD_ID_X;
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
cutsq[ii]=cut_form[ii].x;
form[ii]=cut_form[ii].y;
}
ii+=mul24((int)BLOCK_SIZE_X,(int)BLOCK_ID_X)+start;
__syncthreads();
if (ii<inum) {
__global int *nbor=dev_ij+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
__global int *packed=dev_nbor+ii+nbor_pitch+nbor_pitch;
numtyp4 ix=x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
int newj=0;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j>=nall)
j%=nall;
numtyp4 jx=x_[j];
int jtype=jx.w;
int mtype=itype+jtype;
if (form[mtype]>=form_low && form[mtype]<=form_high) {
// Compute r12;
numtyp rsq=jx.x-ix.x;
rsq*=rsq;
numtyp t=jx.y-ix.y;
rsq+=t*t;
t=jx.z-ix.z;
rsq+=t*t;
if (rsq<cutsq[mtype]) {
*packed=j;
packed+=nbor_pitch;
newj++;
}
}
}
dev_nbor[ii+nbor_pitch]=newj;
}
}
#endif

334
lib/gpu/gb_gpu_memory.cpp Normal file

@@ -0,0 +1,334 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "gb_gpu_cl.h"
#include "gb_gpu_nbor_cl.h"
#else
#include "gb_gpu_ptx.h"
#endif
#include "gb_gpu_memory.h"
#include <cassert>
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false),
_max_bytes(0.0) {
device=&pair_gpu_device;
}
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool GB_GPU_MemoryT::init(const int ntypes, const double gamma,
const double upsilon, const double mu,
double **host_shape, double **host_well,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape,
int **h_form, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4,
double **host_offset, const double *host_special_lj,
const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *_screen) {
nbor_time_avail=false;
screen=_screen;
bool gpu_nbor=false;
if (device->gpu_mode()==PairGPUDevice<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=true;
int _gpu_host=0;
int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split);
if (host_nlocal>0)
_gpu_host=1;
if (!device->init(false,true,nlocal,host_nlocal,nall,0,gpu_nbor,_gpu_host,
max_nbors,cell_size,true))
return false;
ucl_device=device->gpu;
atom=&device->atom;
nbor=&device->nbor;
_block_size=BLOCK_1D;
if (static_cast<size_t>(_block_size)>ucl_device->group_size())
_block_size=ucl_device->group_size();
compile_kernels(*ucl_device);
// Initialize host-device load balancer
hd_balancer.init(device,gpu_split);
// Initialize timers for the selected GPU
time_pair.init(*ucl_device);
time_pair.zero();
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && _block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for copying type data
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*ucl_device,
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
sigma_epsilon.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,sigma_epsilon,host_write,
host_sigma,host_epsilon);
cut_form.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack2(ntypes,lj_types,cut_form,host_write,
host_cutsq,h_form);
lj1.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq,h_form);
lj3.alloc(lj_types*lj_types,*ucl_device,UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
dev_error.alloc(1,*ucl_device);
dev_error.zero();
_allocated=true;
host_form=h_form;
// Initialize timers for the selected GPU
time_kernel.init(*ucl_device);
time_gayberne.init(*ucl_device);
time_kernel2.init(*ucl_device);
time_gayberne2.init(*ucl_device);
time_kernel.zero();
time_gayberne.zero();
time_kernel2.zero();
time_gayberne2.zero();
// Allocate, cast and asynchronous memcpy of constant data
// Copy data for bonded interactions
gamma_upsilon_mu.alloc(7,*ucl_device,UCL_READ_ONLY);
host_write[0]=static_cast<numtyp>(gamma);
host_write[1]=static_cast<numtyp>(upsilon);
host_write[2]=static_cast<numtyp>(mu);
host_write[3]=static_cast<numtyp>(host_special_lj[0]);
host_write[4]=static_cast<numtyp>(host_special_lj[1]);
host_write[5]=static_cast<numtyp>(host_special_lj[2]);
host_write[6]=static_cast<numtyp>(host_special_lj[3]);
ucl_copy(gamma_upsilon_mu,host_write,7,false);
lshape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
UCL_H_Vec<double> d_view;
d_view.view(host_lshape,lshape.numel(),*ucl_device);
ucl_copy(lshape,d_view,false);
// Copy shape, well, sigma, epsilon, and cutsq onto GPU
// - cast if necessary
shape.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_shape[i][0];
host_write[i*4+1]=host_shape[i][1];
host_write[i*4+2]=host_shape[i][2];
}
UCL_H_Vec<numtyp4> view4;
view4.view((numtyp4*)host_write.begin(),shape.numel(),*ucl_device);
ucl_copy(shape,view4,false);
well.alloc(ntypes,*ucl_device,UCL_READ_ONLY);
for (int i=0; i<ntypes; i++) {
host_write[i*4]=host_well[i][0];
host_write[i*4+1]=host_well[i][1];
host_write[i*4+2]=host_well[i][2];
}
view4.view((numtyp4*)host_write.begin(),well.numel(),*ucl_device);
ucl_copy(well,view4,false);
// See if we want fast GB-sphere or sphere-sphere calculations
multiple_forms=false;
for (int i=1; i<ntypes; i++)
for (int j=i; j<ntypes; j++)
if (host_form[i][j]!=ELLIPSE_ELLIPSE)
multiple_forms=true;
if (multiple_forms && host_nlocal>0) {
std::cerr << "Cannot use Gayberne with multiple forms and GPU neighbor.\n";
exit(1);
}
if (multiple_forms)
atom->dev_ans.zero();
_max_bytes=atom->gpu_bytes()+nbor->gpu_bytes();
// Memory for ilist ordered by particle type
return (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::clear() {
if (!_allocated)
return;
UCL_H_Vec<int> err_flag(1,*ucl_device);
ucl_copy(err_flag,dev_error,false);
if (err_flag[0] == 2)
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
err_flag.clear();
_allocated=false;
// Output any timing information
acc_timers();
double single[6], times[6];
single[0]=atom->transfer_time();
single[1]=nbor->time_nbor.total_seconds();
single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+
nbor->time_kernel.total_seconds();
single[3]=time_gayberne.total_seconds()+time_gayberne2.total_seconds();
if (multiple_forms)
single[4]=time_pair.total_seconds();
else
single[4]=0;
single[5]=atom->cast_time();
MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD);
double avg_split=hd_balancer.all_avg_split();
_max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+
sigma_epsilon.row_bytes()+cut_form.row_bytes()+
shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+
gamma_upsilon_mu.row_bytes();
double mpi_max_bytes;
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD);
double max_mb=mpi_max_bytes/(1024*1024);
if (device->world_me()==0)
if (screen && times[3]>0.0) {
int world_size=device->world_size();
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," GPU Time Info (average): ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
if (device->procs_per_gpu()==1) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/world_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/world_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/world_size);
if (nbor->gpu_nbor())
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/world_size);
else
fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/world_size);
fprintf(screen,"Force calc: %.4f s.\n",times[3]/world_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/world_size);
}
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
fprintf(screen,"-------------------------------------");
fprintf(screen,"--------------------------------\n\n");
}
_max_bytes=0.0;
dev_error.clear();
lj1.clear();
lj3.clear();
sigma_epsilon.clear();
cut_form.clear();
shape.clear();
well.clear();
lshape.clear();
gamma_upsilon_mu.clear();
host_olist.clear();
time_kernel.clear();
time_gayberne.clear();
time_kernel2.clear();
time_gayberne2.clear();
time_pair.clear();
hd_balancer.clear();
if (_compiled) {
k_gb_nbor_fast.clear();
k_gb_nbor.clear();
k_gayberne.clear();
k_sphere_gb.clear();
k_lj_fast.clear();
k_lj.clear();
delete pair_program;
delete gb_program;
delete gb_lj_program;
_compiled=false;
}
device->clear();
}
template <class numtyp, class acctyp>
double GB_GPU_MemoryT::host_memory_usage() const {
return device->atom.host_memory_usage()+
device->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(GB_GPU_Memory<numtyp,acctyp>)+
device->nbor.max_atoms()*sizeof(int);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::compile_kernels(UCL_Device &dev) {
if (_compiled)
return;
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
std::string(OCL_PRECISION_COMPILE);
pair_program=new UCL_Program(dev);
pair_program->load_string(gb_gpu_kernel_nbor,flags.c_str());
k_gb_nbor_fast.set_function(*pair_program,"kernel_gb_nbor_fast");
k_gb_nbor.set_function(*pair_program,"kernel_gb_nbor");
gb_program=new UCL_Program(dev);
gb_program->load_string(gb_gpu_kernel,flags.c_str());
k_gayberne.set_function(*gb_program,"kernel_gayberne");
gb_lj_program=new UCL_Program(dev);
gb_lj_program->load_string(gb_gpu_kernel_lj,flags.c_str());
k_sphere_gb.set_function(*gb_lj_program,"kernel_sphere_gb");
k_lj_fast.set_function(*gb_lj_program,"kernel_lj_fast");
k_lj.set_function(*gb_lj_program,"kernel_lj");
_compiled=true;
}
template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;


@@ -1,156 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include "gb_gpu_memory.h"
#define GB_GPU_MemoryT GB_GPU_Memory<numtyp, acctyp>
template <class numtyp, class acctyp>
GB_GPU_MemoryT::GB_GPU_Memory() : LJ_GPU_MemoryT() {
this->atom.atom_fields(8);
this->atom.ans_fields(13);
this->nbor.packing(true);
}
template <class numtyp, class acctyp>
GB_GPU_MemoryT::~GB_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
bool GB_GPU_MemoryT::init(const int ij_size, const int ntypes,
const double gamma, const double upsilon,
const double mu, double **host_shape,
double **host_well, double **host_cutsq,
double **host_sigma, double **host_epsilon,
double *host_lshape, int **h_form, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const bool force_d, const int me) {
_max_nbors=max_nbors;
if (this->allocated)
clear();
bool p=LJ_GPU_MemoryT::init(ij_size,ntypes,host_cutsq,host_sigma,host_epsilon,
host_lj1, host_lj2, host_lj3, host_lj4,
host_offset, host_special_lj, max_nbors, me,
nlocal, nall);
if (!p)
return false;
host_form=h_form;
// Initialize timers for the selected GPU
time_kernel.init();
time_gayberne.init();
time_kernel2.init();
time_gayberne2.init();
// Use the write buffer from atom for data initialization
NVC_HostT &host_write=this->atom.host_write;
assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);
// Allocate, cast and asynchronous memcpy of constant data
gamma_upsilon_mu.safe_alloc(3);
host_write[0]=static_cast<numtyp>(gamma);
host_write[1]=static_cast<numtyp>(upsilon);
host_write[2]=static_cast<numtyp>(mu);
gamma_upsilon_mu.copy_from_host(host_write.begin());
lshape.safe_alloc(ntypes,lshape_get_texture<numtyp>());
lshape.cast_copy(host_lshape,host_write);
lshape.copy_from_host(host_write.begin());
// Copy shape, well, sigma, epsilon, and cutsq onto GPU
shape.safe_alloc(ntypes,3,shape_get_texture<numtyp>());
shape.cast_copy(host_shape[0],host_write);
well.safe_alloc(ntypes,3,well_get_texture<numtyp>());
well.cast_copy(host_well[0],host_write);
// Copy LJ data onto GPU
int lj_types=ntypes;
if (lj_types<=MAX_SHARED_TYPES)
lj_types=MAX_SHARED_TYPES;
form.safe_alloc(lj_types,lj_types,form_get_texture());
form.copy_2Dfrom_host(host_form[0],ntypes,ntypes);
// See if we want fast GB-sphere or sphere-sphere calculations
multiple_forms=false;
for (int i=1; i<ntypes; i++)
for (int j=i; j<ntypes; j++)
if (host_form[i][j]!=ELLIPSE_ELLIPSE)
multiple_forms=true;
// Memory for ilist ordered by particle type
return host_olist.alloc_rw(this->max_local);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::resize_atom(const int nall, bool &success) {
this->max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
this->atom.resize(this->max_atoms, success);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::resize_local(const int nlocal, const int max_nbors,
bool &success) {
if (nlocal>this->max_local) {
this->max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
host_olist.clear();
success=success && host_olist.alloc_rw(this->max_local);
}
if (max_nbors>_max_nbors)
_max_nbors=static_cast<int>(static_cast<double>(max_nbors)*1.10);
this->nbor.resize(this->max_local,_max_nbors,success);
}
template <class numtyp, class acctyp>
void GB_GPU_MemoryT::clear() {
if (!this->allocated)
return;
int err_flag;
this->dev_error.copy_to_host(&err_flag);
if (err_flag == 1)
std::cerr << "COLLISION BUFFER OVERFLOW OCCURED. INCREASE COLLISION_N "
<< "and RECOMPILE.\n";
else if (err_flag == 2)
std::cerr << "BAD MATRIX INVERSION IN FORCE COMPUTATION.\n";
LJ_GPU_MemoryT::clear();
lshape.unbind();
shape.clear();
well.clear();
form.clear();
lshape.clear();
gamma_upsilon_mu.clear();
host_olist.clear();
}
template <class numtyp, class acctyp>
double GB_GPU_MemoryT::host_memory_usage() {
return this->atom.host_memory_usage(this->max_atoms)+
this->nbor.host_memory_usage()+4*sizeof(numtyp)+
sizeof(GB_GPU_Memory<numtyp,acctyp>)+this->max_atoms*sizeof(int);
}
template class GB_GPU_Memory<PRECISION,ACC_PRECISION>;


@@ -12,61 +12,183 @@
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef GB_GPU_MEMORY_H
#define GB_GPU_MEMORY_H
#define MAX_GPU_THREADS 4
#include "lj_gpu_memory.h"
#define BLOCK_1D 64
enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};
#include "pair_gpu_device.h"
#include "pair_gpu_balance.h"
#include "mpi.h"
template <class numtyp, class acctyp>
class GB_GPU_Memory : public LJ_GPU_Memory<numtyp,acctyp> {
class GB_GPU_Memory {
public:
GB_GPU_Memory();
~GB_GPU_Memory();
bool init(const int ij_size, const int ntypes, const double gamma,
/// Clear any previous data and set up for a new LAMMPS run
/** \param gpu_nbor true if neighboring performed on device
* \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
* \return false if there is not sufficient memory or device init prob **/
bool init(const int ntypes, const double gamma,
const double upsilon, const double mu, double **host_shape,
double **host_well, double **host_cutsq, double **host_sigma,
double **host_epsilon, double *host_lshape, int **h_form,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int max_nbors, const int nlocal, const int nall,
const bool force_d, const int me);
double **host_lj4, double **host_offset,
const double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const double cell_size,
const double gpu_split, FILE *screen);
void resize_atom(const int nall, bool &success);
void resize_local(const int nlocal, const int max_nbors, bool &success);
/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
inline void resize_atom(const int inum, const int nall, bool &success) {
atom->resize(inum, nall, success);
if (multiple_forms) atom->dev_ans.zero();
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
/// Check if there is enough storage for neighbors and realloc if not
/** \param nlocal number of particles whose nbors must be stored on device
* \param host_inum number of particles whose nbors need to copied to host
* \param current maximum number of neighbors
* \param olist_size size of list of particles from CPU neighboring
* \note host_inum is 0 if the host is performing neighboring
* \note if GPU is neighboring nlocal+host_inum=total number local particles
* \note if CPU is neighboring olist_size=total number of local particles
* \note if GPU is neighboring olist_size=0 **/
inline void resize_local(const int nlocal, const int host_inum,
const int max_nbors, const int olist_size,
bool &success) {
if (olist_size>static_cast<int>(host_olist.numel())) {
host_olist.clear();
int new_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
}
nbor->resize(nlocal,host_inum,max_nbors,success);
double bytes=atom->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_bytes)
_max_bytes=bytes;
}
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
double host_memory_usage();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
/// Accumulate timers
inline void acc_timers() {
if (nbor_time_avail) {
nbor->time_nbor.add_to_total();
nbor->time_kernel.add_to_total();
nbor_time_avail=false;
}
time_kernel.add_to_total();
time_gayberne.add_to_total();
if (multiple_forms) {
time_kernel2.add_to_total();
time_gayberne2.add_to_total();
time_pair.add_to_total();
}
atom->acc_timers();
}
// ---------------------------- DATA ----------------------------
/// Zero timers
inline void zero_timers() {
nbor_time_avail=false;
time_kernel.zero();
time_gayberne.zero();
if (multiple_forms) {
time_kernel2.zero();
time_gayberne2.zero();
time_pair.zero();
}
atom->zero_timers();
}
// ilist with particles sorted by type
NVC_HostI host_olist;
// --------------- Const Data for Atoms
NVC_ConstMatT shape, well;
NVC_ConstMatI form;
NVC_VecT lshape, gamma_upsilon_mu;
// -------------------------- DEVICE DATA -------------------------
/// Device Properties and Atom and Neighbor storage
PairGPUDevice<numtyp,acctyp> *device;
/// Geryon device
UCL_Device *ucl_device;
/// Device Error Flag - Set if a bad matrix inversion occurs
UCL_D_Vec<int> dev_error;
/// Device timers
UCL_Timer time_kernel, time_gayberne, time_kernel2, time_gayberne2, time_pair;
/// Host device load balancer
PairGPUBalance<numtyp,acctyp> hd_balancer;
/// LAMMPS pointer for screen output
FILE *screen;
// --------------------------- TYPE DATA --------------------------
// --------------- Timing Stuff
NVCTimer time_kernel, time_gayberne, time_kernel2, time_gayberne2;
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = form
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// sigma_epsilon.x = sigma, sigma_epsilon.y = epsilon
UCL_D_Vec<numtyp2> sigma_epsilon;
/// cut_form.x = cutsq, cut_form.y = form
UCL_D_Vec<numtyp2> cut_form;
// 0 - gamma, 1-upsilon, 2-mu, 3-special_lj[0], 4-special_lj[1], ...
UCL_D_Vec<numtyp> gamma_upsilon_mu;
// True if we want to use fast GB-sphere or sphere-sphere calculations
bool multiple_forms;
int **host_form;
int last_ellipse;
int _max_nbors;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
int _lj_types;
// --------------------------- ATOM DATA --------------------------
/// Atom Data
PairGPUAtom<numtyp,acctyp> *atom;
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp4> shape, well;
/// Aspherical Const Data for Atoms
UCL_D_Vec<numtyp> lshape;
int last_ellipse, max_last_ellipse;
// --------------------------- NBOR DATA ----------------------------
/// Neighbor data
PairGPUNbor *nbor;
/// ilist with particles sorted by type
UCL_H_Vec<int> host_olist;
/// True if we should accumulate the neighbor timer
bool nbor_time_avail;
// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program, *gb_program, *gb_lj_program;
UCL_Kernel k_gb_nbor_fast, k_gb_nbor;
UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj;
inline int block_size() { return _block_size; }
private:
bool _allocated, _compiled;
int _block_size;
double _max_bytes;
void compile_kernels(UCL_Device &dev);
};
#endif

27
lib/gpu/geryon/README Normal file

@@ -0,0 +1,27 @@
Geryon
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
Geryon is intended to be a simple library for managing the CUDA Runtime,
CUDA Driver, and OpenCL APIs with a consistent interface:
* Change from one API to another by simply changing the namespace
* Use multiple APIs in the same code
* Lightweight (only include files - no build required)
* Manage device query and selection
* Simple vector and matrix containers
* Simple routines for data copy and type casting
* Simple routines for data I/O
* Simple classes for managing device timing
* Simple classes for managing kernel compilation and execution
Geryon does not require building (although a Makefile is provided for testing
purposes). The library is a set of header files that can be included with
your code.
Documentation and examples are provided at
http://users.nccs.gov/~wb8/geryon/index.htm
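
As a quick orientation, the sketch below is not part of the library sources
(the device index is illustrative); it uses the UCL_Device class from the
CUDA runtime flavor (nvc_device.h, included later in this commit) to list the
CUDA devices on a node and make the first one active:

  #include <iostream>
  #include "nvc_device.h"

  int main() {
    ucl_cudart::UCL_Device dev;   // gathers properties for every CUDA device
    dev.print_all(std::cout);     // print name, memory, cores, etc.
    if (dev.num_devices() > 0)
      dev.set(0);                 // select device 0 for subsequent work
    return 0;
  }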


@@ -0,0 +1 @@
Geryon Version 10.280

47
lib/gpu/geryon/file_to_cstr.sh Executable file

@@ -0,0 +1,47 @@
#!/bin/sh
# Convert PTX assembly output into C-style string constants.
# Written in portable POSIX shell.
# requires: sed, rm, mv
#
# Author: Axel Kohlmeyer, Temple University
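#
# Usage: the last argument names the generated header; every preceding
# argument is a kernel source to embed. For example (file names illustrative):
#   sh file_to_cstr.sh gb_gpu_kernel.ptx gb_gpu_ptx.h
# writes "const char * gb_gpu_kernel = ..." into gb_gpu_ptx.h; these strings
# are what compile_kernels() in gb_gpu_memory.cpp hands to load_string().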
num_args=$#
# we write to a scratch file, since
# we know the real file name only at
# the very end.
output=geryon.tmp.$$
: > $output
# remove temporary file in case we're interrupted.
cleanup () {
rm -f geryon.tmp.$$
}
trap cleanup INT QUIT TERM
# loop over arguments and convert to
# string constants.
i=1
while [ $i -lt $num_args ]
do \
src=$1
krn=${src##*/}
krn=${krn%.*}
echo "Converting kernel $krn from $src to a c-style string"
echo "const char * $krn = " >> $output
sed -e 's/\\/\\\\/g' \
-e 's/"/\\"/g' \
-e 's/ *\/\/.*$//' \
-e '/\.file/D' \
-e '/^[ ]*$/D' \
-e 's/^\(.*\)$/"\1\\n"/' $src >> $output
echo ';' >> $output
shift
i=`expr $i + 1`
done
# $1 holds now the real output file name
mv $output $1

311
lib/gpu/geryon/nvc_device.h Normal file

@@ -0,0 +1,311 @@
/***************************************************************************
nvc_device.h
-------------------
W. Michael Brown
Utilities for dealing with cuda devices
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Wed Jan 28 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVC_DEVICE
#define NVC_DEVICE
#include <string>
#include <vector>
#include <iostream>
#include <cstdlib>
#include "nvc_macros.h"
#include "ucl_types.h"
namespace ucl_cudart {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef cudaStream_t command_queue;
inline void ucl_sync(cudaStream_t &stream) {
CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
}
/// Class for looking at device properties
/** \note Calls to change the device outside of the class result in incorrect
* behavior
* \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
public:
/// Collect properties for every GPU on the node
/** \note You must set the active GPU with set() before using the device **/
UCL_Device();
~UCL_Device();
/// Returns 1 (For compatibility with OpenCL)
inline int num_platforms() { return 1; }
/// Return a string with name and info of the current platform
std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA"; }
/// Return the number of devices that support CUDA
inline int num_devices() { return _properties.size(); }
/// Set the CUDA device to the specified device number
void set(int num);
/// Get the current device number
inline int device_num() { return _device; }
/// Returns the default stream for the current device
inline command_queue & cq() { return cq(0); }
/// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; }
/// Block until all commands in the default stream have completed
inline void sync() { sync(0); }
/// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); }
/// Get the number of command queues currently available on device
inline int num_queues()
{ if (_device==-1) return 0; else return _cq.size(); }
/// Add a stream for device computations
inline void push_command_queue() {
_cq.push_back(cudaStream_t());
CUDA_SAFE_CALL_NS(cudaStreamCreate(&_cq.back()));
}
/// Remove a stream for device computations
/** \note You cannot delete the default stream **/
inline void pop_command_queue() {
if (_cq.size()<2) return;
CUDA_SAFE_CALL_NS(cudaStreamDestroy(_cq.back()));
_cq.pop_back();
}
/// Get the current CUDA device name
inline std::string name() { return name(_device); }
/// Get the CUDA device name
inline std::string name(const int i)
{ return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); }
/// Get a string telling the type of the device
inline std::string device_type_name(const int i) { return "GPU"; }
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; }
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
bool double_precision(const int i) {return arch(i)>=1.3;}
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;
else return _properties[i].multiProcessorCount*32; }
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
/// Return the GPGPU compute capability for current device
inline double arch() { return arch(_device); }
/// Return the GPGPU compute capability
inline double arch(const int i)
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i) { return _properties[i].clockRate*1e-6;}
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
inline size_t group_size(const int i)
{ return _properties[i].maxThreadsPerBlock; }
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
inline size_t max_pitch(const int i) { return _properties[i].memPitch; }
/// List all devices along with all properties
void print_all(std::ostream &out);
private:
int _device, _num_devices;
std::vector<cudaDeviceProp> _properties;
std::vector<cudaStream_t> _cq;
};
// Grabs the properties for all devices
inline UCL_Device::UCL_Device() {
CUDA_SAFE_CALL_NS(cudaGetDeviceCount(&_num_devices));
for (int dev=0; dev<_num_devices; ++dev) {
cudaDeviceProp deviceProp;
CUDA_SAFE_CALL_NS(cudaGetDeviceProperties(&deviceProp, dev));
if (deviceProp.major == 9999 && deviceProp.minor == 9999)
break;
_properties.push_back(deviceProp);
}
_device=-1;
_cq.push_back(cudaStream_t());
_cq.back()=0;
}
inline UCL_Device::~UCL_Device() {
for (int i=1; i<num_queues(); i++) pop_command_queue();
}
// Set the CUDA device to the specified device number
inline void UCL_Device::set(int num) {
if (_device==num)
return;
for (int i=1; i<num_queues(); i++) pop_command_queue();
cudaThreadExit();
CUDA_SAFE_CALL_NS(cudaSetDevice(num));
_device=num;
}
// List all devices along with all properties
inline void UCL_Device::print_all(std::ostream &out) {
#if CUDART_VERSION >= 2020
int driver_version, runtime_version;
cudaDriverGetVersion(&driver_version);
out << "CUDA Driver Version: "
<< driver_version/1000 << "." << driver_version%100
<< std::endl;
cudaRuntimeGetVersion(&runtime_version);
out << "CUDA Runtime Version: "
<< runtime_version/1000 << "." << runtime_version%100
<< std::endl;
#endif
if (num_devices() == 0)
out << "There is no device supporting CUDA\n";
for (int i=0; i<num_devices(); ++i) {
out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
out << " Type of device: "
<< device_type_name(i).c_str() << std::endl;
out << " Compute capability: "
<< arch(i) << std::endl;
out << " Double precision support: ";
if (double_precision(i))
out << "Yes\n";
else
out << "No\n";
out << " Total amount of global memory: "
<< gigabytes(i) << " GB\n";
#if CUDART_VERSION >= 2000
out << " Number of compute units/multiprocessors: "
<< _properties[i].multiProcessorCount << std::endl;
out << " Number of cores: "
<< cores(i) << std::endl;
#endif
out << " Total amount of constant memory: "
<< _properties[i].totalConstMem << " bytes\n";
out << " Total amount of local/shared memory per block: "
<< _properties[i].sharedMemPerBlock << " bytes\n";
out << " Total number of registers available per block: "
<< _properties[i].regsPerBlock << std::endl;
out << " Warp size: "
<< _properties[i].warpSize << std::endl;
out << " Maximum number of threads per block: "
<< _properties[i].maxThreadsPerBlock << std::endl;
out << " Maximum group size (# of threads per block) "
<< _properties[i].maxThreadsDim[0] << " x "
<< _properties[i].maxThreadsDim[1] << " x "
<< _properties[i].maxThreadsDim[2] << std::endl;
out << " Maximum item sizes (# threads for each dim) "
<< _properties[i].maxGridSize[0] << " x "
<< _properties[i].maxGridSize[1] << " x "
<< _properties[i].maxGridSize[2] << std::endl;
out << " Maximum memory pitch: "
<< max_pitch(i) << " bytes\n";
out << " Texture alignment: "
<< _properties[i].textureAlignment << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
#if CUDART_VERSION >= 2000
out << " Concurrent copy and execution: ";
if (_properties[i].deviceOverlap)
out << "Yes\n";
else
out << "No\n";
#endif
#if CUDART_VERSION >= 2020
out << " Run time limit on kernels: ";
if (_properties[i].kernelExecTimeoutEnabled)
out << "Yes\n";
else
out << "No\n";
out << " Integrated: ";
if (_properties[i].integrated)
out << "Yes\n";
else
out << "No\n";
out << " Support host page-locked memory mapping: ";
if (_properties[i].canMapHostMemory)
out << "Yes\n";
else
out << "No\n";
out << " Compute mode: ";
if (_properties[i].computeMode == cudaComputeModeDefault)
out << "Default\n"; // multiple threads can use device
else if (_properties[i].computeMode == cudaComputeModeExclusive)
out << "Exclusive\n"; // only thread can use device
else if (_properties[i].computeMode == cudaComputeModeProhibited)
out << "Prohibited\n"; // no thread can use device
else
out << "Unknown\n";
#endif
#if CUDART_VERSION >= 3000
out << " Concurrent kernel execution: ";
if (_properties[i].concurrentKernels)
out << "Yes\n";
else
out << "No\n";
out << " Device has ECC support enabled: ";
if (_properties[i].ECCEnabled)
out << "Yes\n";
else
out << "No\n";
#endif
}
}
}
#endif
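A minimal usage sketch for the runtime-API device class above (an illustrative addition, not part of the commit; it assumes the class sits in the ucl_cudart namespace used by the other CUDA-runtime Geryon headers and that its header is included):

#include <iostream>

// List every CUDA device, then activate device 0 with the accessors above.
int list_and_select(ucl_cudart::UCL_Device &gpu) {
  gpu.print_all(std::cout);                  // full property dump
  if (gpu.num_devices()==0)
    return -1;                               // no CUDA-capable device
  gpu.set(0);                                // select device 0
  std::cout << "Using " << gpu.name() << " (arch " << gpu.arch()
            << ", " << gpu.gigabytes() << " GB)\n";
  return 0;
}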

@@ -0,0 +1,57 @@
#ifndef NVC_MACROS_H
#define NVC_MACROS_H
#if defined(__APPLE__)
#if _GLIBCXX_ATOMIC_BUILTINS == 1
#undef _GLIBCXX_ATOMIC_BUILTINS
#endif // _GLIBCXX_ATOMIC_BUILTINS
#endif // __APPLE__
#include <stdio.h>
#include <cassert>
#include <cuda_runtime.h>
#ifdef MPI_GERYON
#include "mpi.h"
#define NVC_GERYON_EXIT MPI_Abort(MPI_COMM_WORLD,-1)
#else
#define NVC_GERYON_EXIT assert(0==1)
#endif
#ifndef UCL_NO_API_CHECK
#define CUDA_SAFE_CALL_NS( call) do { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in call at file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
NVC_GERYON_EXIT; \
} } while (0)
#ifdef UCL_SYNC_DEBUG
#define CUDA_SAFE_CALL( call) do { \
CUDA_SAFE_CALL_NS( call); \
cudaError err=cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
NVC_GERYON_EXIT; \
} } while (0)
#else
#define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NS( call)
#endif
#else // not DEBUG
// void macros for performance reasons
#define CUDA_SAFE_CALL( call) call
#define CUDA_SAFE_CALL_NS( call) call
#endif
#endif
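To show how the macros above are meant to be used, a short hedged example (the helper name is made up; only the macro and a standard CUDA runtime call are assumed):

#include <cuda_runtime.h>

// Checked device allocation: on failure the macro prints file/line and exits
// (or MPI_Aborts when MPI_GERYON is defined); with UCL_NO_API_CHECK it is a plain call.
inline void * ucl_example_alloc(const size_t nbytes) {
  void *ptr=NULL;
  CUDA_SAFE_CALL(cudaMalloc(&ptr,nbytes));
  return ptr;
}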

@@ -0,0 +1,69 @@
/***************************************************************************
nvc_texture.h
-------------------
W. Michael Brown
Utilities for dealing with CUDA Runtime textures
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Fri Jul 2 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVC_TEXTURE
#define NVC_TEXTURE
#include "nvc_mat.h"
namespace ucl_cudart {
/// Class storing a texture reference
class UCL_Texture {
public:
UCL_Texture() {}
~UCL_Texture() {}
/// Construct with a specified texture reference
inline UCL_Texture(textureReference *t) { get_texture(t); }
/// Set the texture reference for this object
inline void get_texture(textureReference *t) { _tex_ptr=t; }
/// Bind a float array where each fetch grabs a vector of length numel
template<class mat_typ>
inline void bind_float(mat_typ &vec, const unsigned numel) {
#ifdef UCL_DEBUG
assert(numel!=0 && numel<5);
#endif
int bits[4]={0,0,0,0};
for (int i=0; i<numel; i++) bits[i]=32;
_channel = cudaCreateChannelDesc(bits[0], bits[1], bits[2], bits[3],
cudaChannelFormatKindFloat);
(*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
(*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
(*_tex_ptr).filterMode = cudaFilterModePoint;
(*_tex_ptr).normalized = false;
CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,vec.cbegin(),&_channel));
}
/// Unbind the texture reference from the memory allocation
inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }
private:
textureReference *_tex_ptr;
cudaChannelFormatDesc _channel;
};
} // namespace
#endif
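A hedged sketch of how the runtime texture wrapper above is driven (the names bind_positions and tref are illustrative; the container is any UCL matrix/vector type exposing cbegin()):

// Bind a float4 device array to an existing CUDA texture reference, use it,
// then release the binding.
template <class container>
void bind_positions(ucl_cudart::UCL_Texture &tex, textureReference *tref,
                    container &dev_x) {
  tex.get_texture(tref);     // remember the texture reference
  tex.bind_float(dev_x,4);   // each fetch returns a float4
  // ... launch kernels that read dev_x through the texture ...
  tex.unbind();
}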

lib/gpu/geryon/nvd_device.h Normal file
@@ -0,0 +1,359 @@
/***************************************************************************
nvd_device.h
-------------------
W. Michael Brown
Utilities for dealing with cuda devices
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jan 21 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVD_DEVICE
#define NVD_DEVICE
#include <string>
#include <vector>
#include <iostream>
#include "nvd_macros.h"
#include "ucl_types.h"
namespace ucl_cudadr {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef CUstream command_queue;
inline void ucl_sync(CUstream &stream) {
CU_SAFE_CALL(cuStreamSynchronize(stream));
}
struct NVDProperties {
std::string name;
int major;
int minor;
CUDA_INT_TYPE totalGlobalMem;
int multiProcessorCount;
CUdevprop_st p;
int kernelExecTimeoutEnabled;
int integrated;
int canMapHostMemory;
int concurrentKernels;
int ECCEnabled;
};
/// Class for looking at device properties
/** \note Calls that change the device outside of this class result in
* incorrect behavior
* \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
public:
/// Collect properties for every GPU on the node
/** \note You must set the active GPU with set() before using the device **/
UCL_Device();
~UCL_Device();
/// Returns 1 (For compatibility with OpenCL)
inline int num_platforms() { return 1; }
/// Return a string with name and info of the current platform
std::string platform_name() { return "NVIDIA Corporation NVIDIA CUDA Driver"; }
/// Return the number of devices that support CUDA
inline int num_devices() { return _properties.size(); }
/// Set the CUDA device to the specified device number
/** A context and default command queue will be created for the device **/
void set(int num);
/// Get the current device number
inline int device_num() { return _device; }
/// Returns the default stream for the current device
inline command_queue & cq() { return cq(0); }
/// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; }
/// Block until all commands in the default stream have completed
inline void sync() { sync(0); }
/// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); }
/// Get the number of command queues currently available on device
inline int num_queues()
{ return _cq.size(); }
/// Add a stream for device computations
inline void push_command_queue() {
_cq.push_back(CUstream());
CU_SAFE_CALL(cuStreamCreate(&_cq.back(),0));
}
/// Remove a stream for device computations
/** \note You cannot delete the default stream **/
inline void pop_command_queue() {
if (_cq.size()<2) return;
CU_SAFE_CALL_NS(cuStreamDestroy(_cq.back()));
_cq.pop_back();
}
/// Get the current CUDA device name
inline std::string name() { return name(_device); }
/// Get the CUDA device name
inline std::string name(const int i)
{ return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); }
/// Get a string telling the type of the device
inline std::string device_type_name(const int i) { return "GPU"; }
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i) { return UCL_GPU; }
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
bool double_precision(const int i) {return arch(i)>=1.3;}
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ if (arch(i)<2.0) return _properties[i].multiProcessorCount*8;  // 8 cores per SM before Fermi
else return _properties[i].multiProcessorCount*32; }             // 32 cores per SM on Fermi
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].totalGlobalMem)/1073741824; }
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
inline size_t bytes(const int i) { return _properties[i].totalGlobalMem; }
// Get the gigabytes of free memory in the current device
inline double free_gigabytes() { return free_gigabytes(_device); }
// Get the gigabytes of free memory
inline double free_gigabytes(const int i)
{ return static_cast<double>(free_bytes(i))/1073741824; }
// Get the bytes of free memory in the current device
inline size_t free_bytes() { return free_bytes(_device); }
// Get the bytes of free memory
inline size_t free_bytes(const int i) {
CUDA_INT_TYPE dfree, dtotal;
CU_SAFE_CALL_NS(cuMemGetInfo(&dfree, &dtotal));
return static_cast<size_t>(dfree);
}
/// Return the GPGPU compute capability for current device
inline double arch() { return arch(_device); }
/// Return the GPGPU compute capability
inline double arch(const int i)
{ return static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i)
{ return _properties[i].p.clockRate*1e-6;}
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
inline size_t group_size(const int i)
{ return _properties[i].p.maxThreadsPerBlock; }
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
inline size_t max_pitch(const int i) { return _properties[i].p.memPitch; }
/// List all devices along with all properties
void print_all(std::ostream &out);
private:
int _device, _num_devices;
std::vector<NVDProperties> _properties;
std::vector<CUstream> _cq;
CUdevice _cu_device;
CUcontext _context;
};
// Grabs the properties for all devices
inline UCL_Device::UCL_Device() {
CU_SAFE_CALL_NS(cuInit(0));
CU_SAFE_CALL_NS(cuDeviceGetCount(&_num_devices));
for (int dev=0; dev<_num_devices; ++dev) {
CUdevice m;
CU_SAFE_CALL_NS(cuDeviceGet(&m,dev));
_properties.push_back(NVDProperties());
char namecstr[1024];
CU_SAFE_CALL_NS(cuDeviceGetName(namecstr,1024,m));
_properties.back().name=namecstr;
CU_SAFE_CALL_NS(cuDeviceComputeCapability(&_properties.back().major,
&_properties.back().minor,m));
CU_SAFE_CALL_NS(cuDeviceTotalMem(&_properties.back().totalGlobalMem,m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(&_properties.back().multiProcessorCount,
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
m));
CU_SAFE_CALL_NS(cuDeviceGetProperties(&_properties.back().p,m));
#if CUDA_VERSION >= 2020
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().kernelExecTimeoutEnabled,
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT,m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().integrated,
CU_DEVICE_ATTRIBUTE_INTEGRATED, m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().canMapHostMemory,
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, m));
#endif
#if CUDA_VERSION >= 3000
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().concurrentKernels,
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, m));
CU_SAFE_CALL_NS(cuDeviceGetAttribute(
&_properties.back().ECCEnabled,
CU_DEVICE_ATTRIBUTE_ECC_ENABLED, m));
#endif
}
_device=-1;
_cq.push_back(CUstream());
_cq.back()=0;
}
inline UCL_Device::~UCL_Device() {
if (_device>-1) {
for (int i=1; i<num_queues(); i++) pop_command_queue();
cuCtxDestroy(_context);
}
}
// Set the CUDA device to the specified device number
inline void UCL_Device::set(int num) {
if (_device==num)
return;
if (_device>-1) {
CU_SAFE_CALL_NS(cuCtxDestroy(_context));
for (int i=1; i<num_queues(); i++) pop_command_queue();
}
CU_SAFE_CALL_NS(cuDeviceGet(&_cu_device,num));
CU_SAFE_CALL_NS(cuCtxCreate(&_context,0,_cu_device));
_device=num;
}
// List all devices along with all properties
inline void UCL_Device::print_all(std::ostream &out) {
#if CUDA_VERSION >= 2020
int driver_version;
cuDriverGetVersion(&driver_version);
out << "CUDA Driver Version: "
<< driver_version/1000 << "." << driver_version%100
<< std::endl;
#endif
if (num_devices() == 0)
out << "There is no device supporting CUDA\n";
for (int i=0; i<num_devices(); ++i) {
out << "\nDevice " << i << ": \"" << name(i) << "\"\n";
out << " Type of device: "
<< device_type_name(i).c_str() << std::endl;
out << " Compute capability: "
<< arch(i) << std::endl;
out << " Double precision support: ";
if (double_precision(i))
out << "Yes\n";
else
out << "No\n";
out << " Total amount of global memory: "
<< gigabytes(i) << " GB\n";
#if CUDA_VERSION >= 2000
out << " Number of compute units/multiprocessors: "
<< _properties[i].multiProcessorCount << std::endl;
out << " Number of cores: "
<< cores(i) << std::endl;
#endif
out << " Total amount of constant memory: "
<< _properties[i].p.totalConstantMemory << " bytes\n";
out << " Total amount of local/shared memory per block: "
<< _properties[i].p.sharedMemPerBlock << " bytes\n";
out << " Total number of registers available per block: "
<< _properties[i].p.regsPerBlock << std::endl;
out << " Warp size: "
<< _properties[i].p.SIMDWidth << std::endl;
out << " Maximum number of threads per block: "
<< _properties[i].p.maxThreadsPerBlock << std::endl;
out << " Maximum group size (# of threads per block) "
<< _properties[i].p.maxThreadsDim[0] << " x "
<< _properties[i].p.maxThreadsDim[1] << " x "
<< _properties[i].p.maxThreadsDim[2] << std::endl;
out << " Maximum item sizes (# threads for each dim) "
<< _properties[i].p.maxGridSize[0] << " x "
<< _properties[i].p.maxGridSize[1] << " x "
<< _properties[i].p.maxGridSize[2] << std::endl;
out << " Maximum memory pitch: "
<< max_pitch(i) << " bytes\n";
out << " Texture alignment: "
<< _properties[i].p.textureAlign << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
#if CUDA_VERSION >= 2020
out << " Run time limit on kernels: ";
if (_properties[i].kernelExecTimeoutEnabled)
out << "Yes\n";
else
out << "No\n";
out << " Integrated: ";
if (_properties[i].integrated)
out << "Yes\n";
else
out << "No\n";
out << " Support host page-locked memory mapping: ";
if (_properties[i].canMapHostMemory)
out << "Yes\n";
else
out << "No\n";
#endif
#if CUDA_VERSION >= 3000
out << " Concurrent kernel execution: ";
if (_properties[i].concurrentKernels)
out << "Yes\n";
else
out << "No\n";
out << " Device has ECC support enabled: ";
if (_properties[i].ECCEnabled)
out << "Yes\n";
else
out << "No\n";
#endif
}
}
}
#endif
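A brief usage sketch for the driver-API device class above (illustrative only; the function name is not from the commit):

#include <iostream>

// Pick device 0, add a second stream for asynchronous work, report basics.
int ucl_example_setup(ucl_cudadr::UCL_Device &gpu) {
  if (gpu.num_devices()==0) return -1;
  gpu.set(0);                    // creates the context and default stream
  gpu.push_command_queue();      // stream 1 for overlapping copies/kernels
  std::cout << gpu.name() << ": " << gpu.cores() << " cores at "
            << gpu.clock_rate() << " GHz\n";
  gpu.sync(1);                   // block on the extra stream
  return 0;
}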

lib/gpu/geryon/nvd_kernel.h Normal file
@@ -0,0 +1,259 @@
/***************************************************************************
nvd_kernel.h
-------------------
W. Michael Brown
Utilities for dealing with CUDA Driver kernels
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Tue Feb 9 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVD_KERNEL
#define NVD_KERNEL
#include "nvd_device.h"
#include <fstream>
namespace ucl_cudadr {
class UCL_Texture;
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
public:
inline UCL_Program(UCL_Device &device) {}
inline ~UCL_Program() {}
/// Initialize the program with a device
inline void init(UCL_Device &device) { }
/// Clear any data associated with program
/** \note Must call init() after each clear **/
inline void clear() { }
/// Load a program from a file and compile with flags
inline int load(const char *filename, const char *flags="",
std::string *log=NULL) {
std::ifstream in(filename);
if (!in || in.is_open()==false) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open kernel file: "
<< filename << std::endl;
exit(1);
#endif
return UCL_FILE_NOT_FOUND;
}
std::string program((std::istreambuf_iterator<char>(in)),
std::istreambuf_iterator<char>());
in.close();
return load_string(program.c_str(),flags,log);
}
/// Load a program from a string and compile with flags
inline int load_string(const char *program, const char *flags="",
std::string *log=NULL) {
if (std::string(flags)=="BINARY")
return load_binary(program);
const unsigned int num_opts=2;
CUjit_option options[num_opts];
void *values[num_opts];
// set up size of compilation log buffer
options[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
values[0] = (void *)(int)10240;
// set up pointer to the compilation log buffer
options[1] = CU_JIT_INFO_LOG_BUFFER;
char clog[10240];
values[1] = clog;
CUresult err=cuModuleLoadDataEx(&_module,program,num_opts,
options,(void **)values);
if (log!=NULL)
*log=std::string(clog);
if (err != CUDA_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << std::endl
<< "----------------------------------------------------------\n"
<< " UCL Error: Error compiling PTX Program...\n"
<< "----------------------------------------------------------\n";
std::cerr << clog << std::endl;
#endif
return UCL_COMPILE_ERROR;
}
return UCL_SUCCESS;
}
/// Load a precompiled program from a file
inline int load_binary(const char *filename) {
CUresult err = cuModuleLoad(&_module,filename); // load into the member _module
if (err==301) { // 301 == CUDA_ERROR_FILE_NOT_FOUND
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open binary kernel file: "
<< filename << std::endl;
exit(1);
#endif
return UCL_FILE_NOT_FOUND;
} else if (err!=CUDA_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Error loading binary kernel file: "
<< filename << std::endl;
exit(1);
#endif
return UCL_FILE_NOT_FOUND;
}
//int ucl_error=UCL_SUCCESS;
//if (err==301)
// return UCL_FILE_NOT_FOUND;
//else if (err!=CUDA_SUCCESS)
// return UCL_ERROR;
return UCL_SUCCESS;
}
friend class UCL_Kernel;
private:
CUmodule _module;
friend class UCL_Texture;
};
/// Class for dealing with CUDA Driver kernels
class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _num_args(0), _param_size(0)
{ _num_blocks[0]=0; }
UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _num_args(0), _param_size(0)
{ _num_blocks[0]=0; set_function(program,function); }
~UCL_Kernel() {}
/// Clear any function associated with the kernel
inline void clear() { }
/// Get the kernel function from a program
/** \ret UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
inline int set_function(UCL_Program &program, const char *function) {
CUresult err=cuModuleGetFunction(&_kernel,program._module,function);
if (err!=CUDA_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not find function: " << function
<< " in program.\n";
exit(1);
#endif
return UCL_FUNCTION_NOT_FOUND;
}
return UCL_SUCCESS;
}
/// Set the kernel argument.
/** If not a device pointer, this must be repeated each time the argument
* changes
* \note To set kernel parameter i (i>0), parameter i-1 must be set **/
template <class dtype>
inline void set_arg(const unsigned index, dtype *arg) {
if (index==_num_args)
add_arg(arg);
else if (index<_num_args)
CU_SAFE_CALL(cuParamSetv(_kernel, _offsets[index], arg, sizeof(dtype)));
else
assert(0==1); // Must add kernel parameters in sequential order
}
/// Add a kernel argument.
inline void add_arg(const CUdeviceptr* const arg) {
void* ptr = (void*)(size_t)(*arg);
_param_size = (_param_size + __alignof(ptr) - 1) & ~(__alignof(ptr) - 1);
CU_SAFE_CALL(cuParamSetv(_kernel, _param_size, &ptr, sizeof(ptr)));
_offsets.push_back(_param_size);
_param_size+=sizeof(ptr);
_num_args++;
}
/// Add a kernel argument.
template <class dtype>
inline void add_arg(const dtype* const arg) {
_param_size = (_param_size+__alignof(dtype)-1) & ~(__alignof(dtype)-1);
CU_SAFE_CALL(cuParamSetv(_kernel,_param_size,(void*)arg,sizeof(dtype)));
_offsets.push_back(_param_size);
_param_size+=sizeof(dtype);
_num_args++;
}
/// Set the number of thread blocks and the number of threads in each block
/** \note This should be called after all arguments have been added **/
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks;
_num_blocks[1]=1;
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size,1,1));
}
/// Set the number of thread blocks and the number of threads in each block
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,1));
}
/// Set the number of thread blocks and the number of threads in each block
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
_dimensions=2;
_num_blocks[0]=num_blocks_x;
_num_blocks[1]=num_blocks_y;
CU_SAFE_CALL(cuFuncSetBlockShape(_kernel,block_size_x,block_size_y,
block_size_z));
}
/// Run the kernel in the default command queue
inline void run() {
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],0));
}
/// Run the kernel in the specified command queue
inline void run(command_queue &cq) {
CU_SAFE_CALL(cuParamSetSize(_kernel,_param_size));
CU_SAFE_CALL(cuLaunchGridAsync(_kernel,_num_blocks[0],_num_blocks[1],cq));
}
/// Clear any arguments associated with the kernel
inline void clear_args() { _num_args=0; _offsets.clear(); _param_size=0; }
#include "ucl_arg_kludge.h"
private:
CUfunction _kernel;
unsigned _dimensions;
unsigned _num_blocks[2];
unsigned _num_args;
std::vector<unsigned> _offsets;
unsigned _param_size;
friend class UCL_Texture;
};
} // namespace
#endif
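The compile/launch flow of the two classes above, as a hedged sketch (the PTX file name scale_kernel.ptx and kernel name k_scale are hypothetical):

// JIT a PTX module, fetch one kernel, push arguments in order, and launch.
int ucl_example_launch(ucl_cudadr::UCL_Device &gpu, CUdeviceptr x, int n) {
  ucl_cudadr::UCL_Program prog(gpu);
  if (prog.load("scale_kernel.ptx")!=UCL_SUCCESS)
    return -1;
  ucl_cudadr::UCL_Kernel k(prog,"k_scale");   // cuModuleGetFunction under the hood
  k.add_arg(&x);                              // device pointer argument
  k.add_arg(&n);                              // plain int argument
  k.set_size((n+127)/128,128);                // grid blocks, threads per block
  k.run(gpu.cq());                            // asynchronous launch
  gpu.sync();
  return 0;
}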

@@ -0,0 +1,57 @@
#ifndef NVD_MACROS_H
#define NVD_MACROS_H
#include <stdio.h>
#include <cassert>
#include <cuda.h>
#if CUDA_VERSION >= 3020
#define CUDA_INT_TYPE size_t
#else
#define CUDA_INT_TYPE unsigned
#endif
#ifdef MPI_GERYON
#include "mpi.h"
#define NVD_GERYON_EXIT MPI_Abort(MPI_COMM_WORLD,-1)
#else
#define NVD_GERYON_EXIT assert(0==1)
#endif
#ifndef UCL_NO_API_CHECK
#define CU_SAFE_CALL_NS( call ) do { \
CUresult err = call; \
if( CUDA_SUCCESS != err) { \
fprintf(stderr, "Cuda driver error %d in call at file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
NVD_GERYON_EXIT; \
} } while (0)
#ifdef UCL_SYNC_DEBUG
#define CU_SAFE_CALL( call ) do { \
CU_SAFE_CALL_NS( call ); \
CUresult err=cuCtxSynchronize(); \
if( CUDA_SUCCESS != err) { \
fprintf(stderr, "Cuda driver error %d in file '%s' in line %i.\n", \
err, __FILE__, __LINE__ ); \
NVD_GERYON_EXIT; \
} } while (0)
#else
#define CU_SAFE_CALL( call ) CU_SAFE_CALL_NS( call )
#endif
#else // not DEBUG
// void macros for performance reasons
#define CU_SAFE_CALL_NS( call ) call
#define CU_SAFE_CALL( call) call
#endif
#endif
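For illustration, the driver-API macro above wrapped around an allocation (the helper name is invented; CUDA_INT_TYPE comes from this header):

// Checked cuMemAlloc: a failing call prints the CUresult code with file/line.
inline CUdeviceptr ucl_example_dev_alloc(const CUDA_INT_TYPE nbytes) {
  CUdeviceptr ptr;
  CU_SAFE_CALL(cuMemAlloc(&ptr,nbytes));
  return ptr;
}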

lib/gpu/geryon/nvd_mat.h Normal file
@@ -0,0 +1,54 @@
/***************************************************************************
nvd_mat.h
-------------------
W. Michael Brown
CUDA Driver Specific Vector/Matrix Containers, Memory Management, and I/O
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jan 21 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/*! \file */
#ifndef NVD_MAT_H
#define NVD_MAT_H
#include "nvd_memory.h"
/// Namespace for CUDA Driver routines
namespace ucl_cudadr {
#define _UCL_MAT_ALLOW
#define _UCL_DEVICE_PTR_MAT
#include "ucl_basemat.h"
#include "ucl_h_vec.h"
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _UCL_MAT_ALLOW
#define UCL_COPY_ALLOW
#include "ucl_copy.h"
#undef UCL_COPY_ALLOW
#define UCL_PRINT_ALLOW
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW
} // namespace ucl_cudadr
#endif

lib/gpu/geryon/nvd_memory.h Normal file
@@ -0,0 +1,610 @@
/***************************************************************************
nvd_memory.h
-------------------
W. Michael Brown
CUDA Driver Specific Memory Management and Vector/Matrix Containers
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jan 21 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVD_MEMORY_H
#define NVD_MEMORY_H
#include <iostream>
#include <cassert>
#include <cstring>
#include "nvd_macros.h"
#include "ucl_types.h"
namespace ucl_cudadr {
// --------------------------------------------------------------------------
// - API Specific Types
// --------------------------------------------------------------------------
//typedef dim3 ucl_kernel_dim;
// --------------------------------------------------------------------------
// - API SPECIFIC DEVICE POINTERS
// --------------------------------------------------------------------------
typedef CUdeviceptr device_ptr;
// --------------------------------------------------------------------------
// - HOST MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind) {
CUresult err=CUDA_SUCCESS;
if (kind==UCL_RW_OPTIMIZED)
err=cuMemAllocHost((void **)mat.host_ptr(),n);
else if (kind==UCL_WRITE_OPTIMIZED)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline int _host_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind) {
CUresult err=CUDA_SUCCESS;
if (kind==UCL_RW_OPTIMIZED)
err=cuMemAllocHost((void **)mat.host_ptr(),n);
else if (kind==UCL_WRITE_OPTIMIZED)
err=cuMemHostAlloc((void **)mat.host_ptr(),n,CU_MEMHOSTALLOC_WRITECOMBINED);
else
*(mat.host_ptr())=(typename mat_type::data_type*)malloc(n);
if (err!=CUDA_SUCCESS || *(mat.host_ptr())==NULL)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline void _host_free(mat_type &mat, const enum UCL_MEMOPT kind) {
if (kind!=UCL_NOT_PINNED)
CU_SAFE_CALL(cuMemFreeHost(mat.begin()));
else
free(mat.begin());
}
// --------------------------------------------------------------------------
// - DEVICE MEMORY ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t n,
const enum UCL_MEMOPT kind) {
CUresult err=cuMemAlloc(&mat.cbegin(),n);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline int _device_alloc(mat_type &mat, UCL_Device &dev, const size_t n,
const enum UCL_MEMOPT kind) {
CUresult err=cuMemAlloc(&mat.cbegin(),n);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
CUresult err;
CUDA_INT_TYPE upitch;
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16);
pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type, class copy_type>
inline int _device_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols, size_t &pitch,
const enum UCL_MEMOPT kind) {
CUresult err;
CUDA_INT_TYPE upitch; // size_t for CUDA >= 3.2, matching cuMemAllocPitch
err=cuMemAllocPitch(&mat.cbegin(),&upitch,
cols*sizeof(typename mat_type::data_type),rows,16);
pitch=static_cast<size_t>(upitch);
if (err!=CUDA_SUCCESS)
return UCL_MEMORY_ERROR;
return UCL_SUCCESS;
}
template <class mat_type>
inline void _device_free(mat_type &mat) {
CU_SAFE_CALL(cuMemFree(mat.cbegin()));
}
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in) {
*ptr=in;
}
template <class numtyp>
inline void _device_view(CUdeviceptr *ptr, numtyp *in) {
*ptr=0;
}
inline void _device_view(CUdeviceptr *ptr, CUdeviceptr &in,
const size_t offset, const size_t numsize) {
*ptr=in+offset*numsize;
}
template <class numtyp>
inline void _device_view(CUdeviceptr *ptr, numtyp *in,
const size_t offset, const size_t numsize) {
*ptr=0;
}
// --------------------------------------------------------------------------
// - DEVICE IMAGE ALLOCATION ROUTINES
// --------------------------------------------------------------------------
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, copy_type &cm, const size_t rows,
const size_t cols) {
assert(0==1);
}
template <class mat_type, class copy_type>
inline void _device_image_alloc(mat_type &mat, UCL_Device &d, const size_t rows,
const size_t cols) {
assert(0==1);
}
template <class mat_type>
inline void _device_image_free(mat_type &mat) {
assert(0==1);
}
// --------------------------------------------------------------------------
// - ZERO ROUTINES
// --------------------------------------------------------------------------
inline void _host_zero(void *ptr, const size_t n) {
memset(ptr,0,n);
}
template <class mat_type>
inline void _device_zero(mat_type &mat, const size_t n) {
if (n%32==0)
CU_SAFE_CALL(cuMemsetD32(mat.cbegin(),0,n/4));
else if (n%16==0)
CU_SAFE_CALL(cuMemsetD16(mat.cbegin(),0,n/2));
else
CU_SAFE_CALL(cuMemsetD8(mat.cbegin(),0,n));
}
// --------------------------------------------------------------------------
// - HELPER FUNCTIONS FOR MEMCPY ROUTINES
// --------------------------------------------------------------------------
inline void _nvd_set_2D_loc(CUDA_MEMCPY2D &ins, const size_t dpitch,
const size_t spitch, const size_t cols,
const size_t rows) {
ins.srcXInBytes=0;
ins.srcY=0;
ins.srcPitch=spitch;
ins.dstXInBytes=0;
ins.dstY=0;
ins.dstPitch=dpitch;
ins.WidthInBytes=cols;
ins.Height=rows;
}
template <int mem> struct _nvd_set_2D_mem;
template <> struct _nvd_set_2D_mem<1>
{ static CUmemorytype a() { return CU_MEMORYTYPE_HOST; } };
template <> struct _nvd_set_2D_mem<2>
{ static CUmemorytype a() { return CU_MEMORYTYPE_ARRAY; } };
template <int mem> struct _nvd_set_2D_mem
{ static CUmemorytype a() { return CU_MEMORYTYPE_DEVICE; } };
// --------------------------------------------------------------------------
// - MEMCPY ROUTINES
// --------------------------------------------------------------------------
template<int mem1, int mem2> struct _ucl_memcpy;
// Both are images
template<> struct _ucl_memcpy<2,2> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Destination is texture, source on device
template<> struct _ucl_memcpy<2,0> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Destination is texture, source on host
template<> struct _ucl_memcpy<2,1> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstArray=dst.cbegin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Source is texture, dest on device
template<> struct _ucl_memcpy<0,2> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Source is texture, dest on host
template<> struct _ucl_memcpy<1,2> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
assert(0==1);
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcArray=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Neither are textures, destination on host
template <> struct _ucl_memcpy<1,0> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
CU_SAFE_CALL(cuMemcpyDtoH(dst.begin(),src.cbegin(),n));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
CU_SAFE_CALL(cuMemcpyDtoHAsync(dst.begin(),src.cbegin(),n,cq));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Neither are textures, source on host
template <> struct _ucl_memcpy<0,1> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
CU_SAFE_CALL(cuMemcpyHtoD(dst.cbegin(),src.begin(),n));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
CU_SAFE_CALL(cuMemcpyHtoDAsync(dst.cbegin(),src.begin(),n,cq));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Neither are textures, both on host
template <> struct _ucl_memcpy<1,1> {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n)
{ memcpy(dst.begin(),src.begin(),n); }
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq)
{ memcpy(dst.begin(),src.begin(),n); }
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstHost=dst.begin();
ins.srcHost=src.begin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
};
// Neither are textures, both on device
template <int mem1, int mem2> struct _ucl_memcpy {
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n) {
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const p2 &src, const size_t n,
CUstream &cq) {
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin(),src.cbegin(),n));
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
if (p1::PADDED==0 || p2::PADDED==0) {
size_t src_offset=0, dst_offset=0;
for (size_t i=0; i<rows; i++) {
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
src.cbegin()+src_offset,cols));
src_offset+=spitch;
dst_offset+=dpitch;
}
} else {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2D(&ins));
}
}
template <class p1, class p2>
static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
const size_t spitch, const size_t cols,
const size_t rows, CUstream &cq) {
if (p1::PADDED==0 || p2::PADDED==0) {
size_t src_offset=0, dst_offset=0;
for (size_t i=0; i<rows; i++) {
CU_SAFE_CALL(cuMemcpyDtoD(dst.cbegin()+dst_offset,
src.cbegin()+src_offset,cols));
src_offset+=spitch;
dst_offset+=dpitch;
}
} else {
CUDA_MEMCPY2D ins;
_nvd_set_2D_loc(ins,dpitch,spitch,cols,rows);
ins.dstMemoryType=_nvd_set_2D_mem<p1::MEM_TYPE>::a();
ins.srcMemoryType=_nvd_set_2D_mem<p2::MEM_TYPE>::a();
ins.dstDevice=dst.cbegin();
ins.srcDevice=src.cbegin();
CU_SAFE_CALL(cuMemcpy2DAsync(&ins,cq));
}
}
};
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n);
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const mat2 &src, const size_t n,
CUstream &cq) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,src,n,cq);
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
const size_t rows) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
rows);
}
template<class mat1, class mat2>
inline void ucl_mv_cpy(mat1 &dst, const size_t dpitch, const mat2 &src,
const size_t spitch, const size_t cols,
const size_t rows,CUstream &cq) {
_ucl_memcpy<mat1::MEM_TYPE,mat2::MEM_TYPE>::mc(dst,dpitch,src,spitch,cols,
rows,cq);
}
} // namespace ucl_cudadr
#endif
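The copy routines above dispatch on a MEM_TYPE trait carried by the container types. A hedged sketch with two toy stand-ins (not the real UCL containers) shows the interface they must provide for a 1D host-to-device copy:

// MEM_TYPE 0 = device memory, 1 = host memory, 2 = texture/image;
// PADDED is only consulted by the pitched device-to-device 2D path.
struct ToyDevVec {
  enum { MEM_TYPE=0, PADDED=0 };
  CUdeviceptr ptr;
  CUdeviceptr & cbegin() { return ptr; }
  const CUdeviceptr & cbegin() const { return ptr; }
};
struct ToyHostVec {
  enum { MEM_TYPE=1, PADDED=0 };
  float *ptr;
  float * begin() { return ptr; }
  const float * begin() const { return ptr; }
};

// Resolves to _ucl_memcpy<0,1>::mc, i.e. cuMemcpyHtoD.
inline void toy_upload(ToyDevVec &dst, const ToyHostVec &src, const size_t nbytes) {
  ucl_cudadr::ucl_mv_cpy(dst,src,nbytes);
}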

@@ -0,0 +1,71 @@
/***************************************************************************
nvd_texture.h
-------------------
W. Michael Brown
Utilities for dealing with CUDA Driver textures
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Fri Jul 2 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVD_TEXTURE
#define NVD_TEXTURE
#include "nvd_kernel.h"
#include "nvd_mat.h"
namespace ucl_cudadr {
/// Class storing a texture reference
class UCL_Texture {
public:
UCL_Texture() {}
~UCL_Texture() {}
/// Construct with a specified texture reference
inline UCL_Texture(UCL_Program &prog, const char *texture_name)
{ get_texture(prog,texture_name); }
/// Set the texture reference for this object
inline void get_texture(UCL_Program &prog, const char *texture_name)
{ CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
/// Bind a float array where each fetch grabs a vector of length numel
template<class mat_typ>
inline void bind_float(mat_typ &vec, const unsigned numel) {
#ifdef UCL_DEBUG
assert(numel!=0 && numel<5);
#endif
CU_SAFE_CALL(cuTexRefSetAddress(NULL, _tex, vec.cbegin(),
vec.numel()*vec.element_size()));
CU_SAFE_CALL(cuTexRefSetFormat(_tex, CU_AD_FORMAT_FLOAT, numel));
}
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) {
CU_SAFE_CALL(cuParamSetTexRef(kernel._kernel, CU_PARAM_TR_DEFAULT, _tex));
}
private:
CUtexref _tex;
friend class UCL_Kernel;
};
} // namespace
#endif
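A hedged pairing of the driver-API texture wrapper with a compiled program (the texture name pos_tex is hypothetical; the container is any UCL device type exposing cbegin(), numel() and element_size()):

// Fetch a texture reference from the module, bind a float4 array, and make it
// available to a kernel before launch.
template <class container>
void ucl_example_bind(ucl_cudadr::UCL_Program &prog, ucl_cudadr::UCL_Kernel &k,
                      ucl_cudadr::UCL_Texture &tex, container &dev_x) {
  tex.get_texture(prog,"pos_tex");  // cuModuleGetTexRef on the module
  tex.bind_float(dev_x,4);          // float4 fetches
  tex.allow(k);                     // cuParamSetTexRef for the kernel
}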

lib/gpu/geryon/nvd_timer.h Normal file
@@ -0,0 +1,106 @@
/***************************************************************************
nvd_timer.h
-------------------
W. Michael Brown
Class for timing CUDA Driver routines
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Fri Jan 22 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef NVD_TIMER_H
#define NVD_TIMER_H
#include "nvd_macros.h"
namespace ucl_cudadr {
/// Class for timing CUDA Driver events
class UCL_Timer {
public:
inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
{ init(dev); }
inline ~UCL_Timer() { clear(); }
/// Clear any data associated with timer
/** \note init() must be called to reuse timer after a clear() **/
inline void clear() {
if (_initialized) {
CU_SAFE_CALL(cuEventDestroy(start_event));
CU_SAFE_CALL(cuEventDestroy(stop_event));
_initialized=false;
_total_time=0.0;
}
}
/// Initialize default command queue for timing
inline void init(UCL_Device &dev) { init(dev, dev.cq()); }
/// Initialize command queue for timing
inline void init(UCL_Device &dev, command_queue &cq) {
clear();
_cq=cq;
_initialized=true;
CU_SAFE_CALL( cuEventCreate(&start_event,0) );
CU_SAFE_CALL( cuEventCreate(&stop_event,0) );
}
/// Start timing on command queue
inline void start() { CU_SAFE_CALL(cuEventRecord(start_event,_cq)); }
/// Stop timing on command queue
inline void stop() { CU_SAFE_CALL(cuEventRecord(stop_event,_cq)); }
/// Set the time elapsed to zero (not the total_time)
inline void zero() {
CU_SAFE_CALL(cuEventRecord(start_event,_cq));
CU_SAFE_CALL(cuEventRecord(stop_event,_cq));
}
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()
{ double t=time(); _total_time+=t; return t/1000.0; }
/// Return the time (ms) of last start to stop - Forces synchronization
inline double time() {
float timer;
CU_SAFE_CALL(cuEventSynchronize(stop_event));
CU_SAFE_CALL( cuEventElapsedTime(&timer,start_event,stop_event) );
return timer;
}
/// Return the time (s) of last start to stop - Forces synchronization
inline double seconds() { return time()/1000.0; }
/// Return the total time in ms
inline double total_time() { return _total_time; }
/// Return the total time in seconds
inline double total_seconds() { return _total_time/1000.0; }
private:
CUevent start_event, stop_event;
CUstream _cq;
double _total_time;
bool _initialized;
};
} // namespace
#endif
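A short timing sketch using only the methods above (the enqueued work is left as a placeholder comment):

// Time whatever is enqueued between start() and stop() on the default queue.
inline double ucl_example_time(ucl_cudadr::UCL_Device &gpu) {
  ucl_cudadr::UCL_Timer t(gpu);
  t.start();
  // ... enqueue kernels/copies on gpu.cq() here ...
  t.stop();
  t.add_to_total();        // synchronizes on the stop event
  return t.total_seconds();
}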

lib/gpu/geryon/ocl_device.h Normal file
@@ -0,0 +1,449 @@
/***************************************************************************
ocl_device.h
-------------------
W. Michael Brown
Utilities for dealing with OpenCL devices
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Mon Dec 23 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef OCL_DEVICE
#define OCL_DEVICE
#include <string>
#include <vector>
#include <iostream>
#include "CL/cl.h"
#include "CL/cl_platform.h"
#include "ocl_macros.h"
#include "ucl_types.h"
namespace ucl_opencl {
// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef cl_command_queue command_queue;
typedef cl_context context_type;
inline void ucl_sync(cl_command_queue &cq) {
CL_SAFE_CALL(clFinish(cq));
}
struct OCLProperties {
std::string name;
cl_device_type device_type;
cl_ulong global_mem;
cl_ulong shared_mem;
cl_ulong const_mem;
cl_uint compute_units;
cl_uint clock;
size_t work_group_size;
size_t work_item_size[3];
bool double_precision;
int alignment;
size_t timer_resolution;
};
/// Class for looking at data parallel device properties
/** \note Calls that change the device outside of this class result in
* incorrect behavior
* \note There is no error checking for indexing past the number of devices **/
class UCL_Device {
public:
/// Collect properties for every device on the node
/** \note You must set the active GPU with set() before using the device **/
UCL_Device();
~UCL_Device();
/// Return the number of platforms (0 if error or no platforms)
inline int num_platforms() { return _num_platforms; }
/// Return a string with name and info of the current platform
std::string platform_name();
/// Return the number of devices that support OpenCL
inline int num_devices() { return _num_devices; }
/// Set the OpenCL device to the specified device number
/** A context and default command queue will be created for the device **/
void set(int num);
/// Get the current device number
inline int device_num() { return _device; }
/// Returns the context for the current device
inline cl_context & context() { return _context; }
/// Returns the default stream for the current device
inline command_queue & cq() { return cq(0); }
/// Returns the stream indexed by i
inline command_queue & cq(const int i) { return _cq[i]; }
/// Block until all commands in the default stream have completed
inline void sync() { sync(0); }
/// Block until all commands in the specified stream have completed
inline void sync(const int i) { ucl_sync(cq(i)); }
/// Get the number of command queues currently available on device
inline int num_queues()
{ return _cq.size(); }
/// Add a command queue for device computations (with profiling enabled)
inline void push_command_queue() {
cl_int errorv;
_cq.push_back(cl_command_queue());
_cq.back()=clCreateCommandQueue(_context,_cl_device,
CL_QUEUE_PROFILING_ENABLE,&errorv);
if (errorv!=CL_SUCCESS) {
std::cerr << "Could not create command queue on device: " << name()
<< std::endl;
exit(1);
}
}
/// Remove a stream for device computations
/** \note You cannot delete the default stream **/
inline void pop_command_queue() {
if (_cq.size()<2) return;
CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
_cq.pop_back();
}
/// Get the current OpenCL device name
inline std::string name() { return name(_device); }
/// Get the OpenCL device name
inline std::string name(const int i)
{ return std::string(_properties[i].name); }
/// Get a string telling the type of the current device
inline std::string device_type_name() { return device_type_name(_device); }
/// Get a string telling the type of the device
inline std::string device_type_name(const int i);
/// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type() { return device_type(_device); }
/// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
inline int device_type(const int i);
/// Returns true if double precision is supported for the current device
bool double_precision() { return double_precision(_device); }
/// Returns true if double precision is supported for the device
bool double_precision(const int i) {return _properties[i].double_precision;}
/// Get the number of cores in the current device
inline unsigned cores() { return cores(_device); }
/// Get the number of cores
inline unsigned cores(const int i)
{ if (device_type(i)==UCL_CPU) return _properties[i].compute_units;
else return _properties[i].compute_units*8; }
/// Get the gigabytes of global memory in the current device
inline double gigabytes() { return gigabytes(_device); }
/// Get the gigabytes of global memory
inline double gigabytes(const int i)
{ return static_cast<double>(_properties[i].global_mem)/1073741824; }
/// Get the bytes of global memory in the current device
inline size_t bytes() { return bytes(_device); }
/// Get the bytes of global memory
inline size_t bytes(const int i) { return _properties[i].global_mem; }
/// Return the GPGPU revision number for current device
//inline double revision() { return revision(_device); }
/// Return the GPGPU revision number
//inline double revision(const int i)
// { return //static_cast<double>(_properties[i].minor)/10+_properties[i].major;}
/// Clock rate in GHz for current device
inline double clock_rate() { return clock_rate(_device); }
/// Clock rate in GHz
inline double clock_rate(const int i) { return _properties[i].clock*1e-3;}
/// Return the address alignment in bytes
inline int alignment() { return alignment(_device); }
/// Return the address alignment in bytes
inline int alignment(const int i) { return _properties[i].alignment; }
/// Return the timer resolution
inline size_t timer_resolution() { return timer_resolution(_device); }
/// Return the timer resolution
inline size_t timer_resolution(const int i)
{ return _properties[i].timer_resolution; }
/// Get the maximum number of threads per block
inline size_t group_size() { return group_size(_device); }
/// Get the maximum number of threads per block
inline size_t group_size(const int i)
{ return _properties[i].work_group_size; }
/// Return the maximum memory pitch in bytes for current device
inline size_t max_pitch() { return max_pitch(_device); }
/// Return the maximum memory pitch in bytes
inline size_t max_pitch(const int i) { return 0; }
/// List all devices along with all properties
void print_all(std::ostream &out);
/// Return the OpenCL type for the device
inline cl_device_id & cl_device() { return _cl_device; }
private:
int _num_platforms; // Number of platforms
int _platform; // UCL_Device ID for current platform
cl_platform_id _cl_platform; // OpenCL ID for current platform
cl_context _context; // Context used for accessing the device
std::vector<cl_command_queue> _cq;// The default command queue for this device
int _device; // UCL_Device ID for current device
cl_device_id _cl_device; // OpenCL ID for current device
std::vector<cl_device_id> _cl_devices; // OpenCL IDs for all devices
int _num_devices; // Number of devices
std::vector<OCLProperties> _properties; // Properties for each device
void add_properties(cl_device_id);
void create_context();
};
// Grabs the properties for all devices
inline UCL_Device::UCL_Device() {
cl_int errorv;
cl_uint nplatforms;
_cl_device=0;
_device=-1;
_num_devices=0;
_platform=0;
// --- Get Number of Platforms
errorv=clGetPlatformIDs(1,&_cl_platform,&nplatforms);
if (errorv!=CL_SUCCESS) {
_num_platforms=0;
return;
} else
_num_platforms=static_cast<int>(nplatforms);
// --- Get Number of Devices
cl_uint n;
errorv=clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,0,NULL,&n);
_num_devices=n;
if (errorv!=CL_SUCCESS || _num_devices==0) {
_num_devices=0;
return;
}
cl_device_id device_list[_num_devices];
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
&n));
// --- Store properties for each device
for (int i=0; i<_num_devices; i++) {
_cl_devices.push_back(device_list[i]);
add_properties(device_list[i]);
}
}
inline UCL_Device::~UCL_Device() {
if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) {
CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
_cq.pop_back();
}
CL_SAFE_CALL(clReleaseContext(_context));
}
}
inline void UCL_Device::create_context() {
cl_int errorv;
cl_context_properties props[3];
props[0]=CL_CONTEXT_PLATFORM;
props[1]=_platform;
props[2]=0;
_context=clCreateContext(0,1,&_cl_device,NULL,NULL,&errorv);
if (errorv!=CL_SUCCESS) {
std::cerr << "Could not create context on device: " << name() << std::endl;
exit(1);
}
push_command_queue();
}
inline void UCL_Device::add_properties(cl_device_id device_list) {
OCLProperties op;
char buffer[1024];
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,NULL));
op.name=buffer;
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_GLOBAL_MEM_SIZE,
sizeof(op.global_mem),&op.global_mem,NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_LOCAL_MEM_SIZE,
sizeof(op.shared_mem),&op.shared_mem,NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
sizeof(op.const_mem),&op.const_mem,NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_TYPE,
sizeof(op.device_type),&op.device_type,NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_COMPUTE_UNITS,
sizeof(op.compute_units),&op.compute_units,
NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CLOCK_FREQUENCY,
sizeof(op.clock),&op.clock,NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_GROUP_SIZE,
sizeof(op.work_group_size),&op.work_group_size,
NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_ITEM_SIZES,
3*sizeof(op.work_item_size[0]),op.work_item_size,
NULL));
CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MEM_BASE_ADDR_ALIGN,
sizeof(cl_uint),&op.alignment,NULL));
op.alignment/=8;
// Determine if double precision is supported
cl_uint double_width;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
sizeof(double_width),&double_width,NULL));
if (double_width==0)
op.double_precision=false;
else
op.double_precision=true;
CL_SAFE_CALL(clGetDeviceInfo(device_list,
CL_DEVICE_PROFILING_TIMER_RESOLUTION,
sizeof(size_t),&op.timer_resolution,NULL));
_properties.push_back(op);
}
inline std::string UCL_Device::platform_name() {
char info[1024];
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info,
NULL));
std::string ans=std::string(info)+' ';
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_NAME,1024,info,
NULL));
ans+=std::string(info)+' ';
CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VERSION,1024,info,
NULL));
ans+=std::string(info);
return ans;
}
// Get a string telling the type of the device
inline std::string UCL_Device::device_type_name(const int i) {
if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
return "CPU";
else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
return "GPU";
else if (_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR)
return "ACCELERATOR";
else
return "DEFAULT";
}
// Get the enumerated type of the device
inline int UCL_Device::device_type(const int i) {
if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
return UCL_CPU;
else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
return UCL_GPU;
else if (_properties[i].device_type==CL_DEVICE_TYPE_ACCELERATOR)
return UCL_ACCELERATOR;
else
return UCL_DEFAULT;
}
// Set the OpenCL device to the specified device number
inline void UCL_Device::set(int num) {
if (_device==num)
return;
if (_device>-1) {
for (size_t i=0; i<_cq.size(); i++) {
CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
_cq.pop_back();
}
CL_SAFE_CALL(clReleaseContext(_context));
}
cl_device_id device_list[_num_devices];
cl_uint n;
CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
device_list,&n));
_device=num;
_cl_device=device_list[_device];
create_context();
}
// List all devices along with all properties
inline void UCL_Device::print_all(std::ostream &out) {
if (num_devices() == 0)
out << "There is no device supporting OpenCL\n";
for (int i=0; i<num_devices(); ++i) {
out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
out << " Type of device: "
<< device_type_name(i).c_str() << std::endl;
out << " Double precision support: ";
if (double_precision(i))
out << "Yes\n";
else
out << "No\n";
out << " Total amount of global memory: "
<< gigabytes(i) << " GB\n";
out << " Number of compute units/multiprocessors: "
<< _properties[i].compute_units << std::endl;
//out << " Number of cores: "
// << cores(i) << std::endl;
out << " Total amount of constant memory: "
<< _properties[i].const_mem << " bytes\n";
out << " Total amount of local/shared memory per block: "
<< _properties[i].shared_mem << " bytes\n";
//out << " Total number of registers available per block: "
// << _properties[i].regsPerBlock << std::endl;
//out << " Warp size: "
// << _properties[i].warpSize << std::endl;
out << " Maximum group size (# of threads per block) "
<< _properties[i].work_group_size << std::endl;
out << " Maximum item sizes (# threads for each dim) "
<< _properties[i].work_item_size[0] << " x "
<< _properties[i].work_item_size[1] << " x "
<< _properties[i].work_item_size[2] << std::endl;
//out << " Maximum sizes of each dimension of a grid: "
// << _properties[i].maxGridSize[0] << " x "
// << _properties[i].maxGridSize[1] << " x "
// << _properties[i].maxGridSize[2] << std::endl;
//out << " Maximum memory pitch: "
// << _properties[i].memPitch) << " bytes\n";
//out << " Texture alignment: "
// << _properties[i].textureAlignment << " bytes\n";
out << " Clock rate: "
<< clock_rate(i) << " GHz\n";
//out << " Concurrent copy and execution: ";
}
}
}
#endif
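A minimal usage sketch (not part of the original commit) showing how the UCL_Device class above is intended to be driven; it only calls members declared in ocl_device.h and assumes at least one OpenCL platform and device are installed.

#include "ocl_device.h"
#include <iostream>

int main() {
  ucl_opencl::UCL_Device dev;           // constructor enumerates platforms/devices
  if (dev.num_devices()==0) {
    std::cerr << "No OpenCL devices found.\n";
    return 1;
  }
  dev.print_all(std::cout);             // dump the stored properties for every device
  dev.set(0);                           // create context + default queue on device 0
  std::cout << "Using " << dev.name() << " with "
            << dev.gigabytes() << " GB of global memory\n";
  dev.push_command_queue();             // add a second, profiling-enabled queue
  dev.sync(1);                          // block until queue 1 is idle
  return 0;
}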

254
lib/gpu/geryon/ocl_kernel.h Normal file
View File

@@ -0,0 +1,254 @@
/***************************************************************************
ocl_kernel.h
-------------------
W. Michael Brown
Utilities for dealing with OpenCL kernels
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Sun Feb 7 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef OCL_KERNEL
#define OCL_KERNEL
#include "ocl_device.h"
#include <fstream>
namespace ucl_opencl {
/// Class storing 1 or more kernel functions from a single string or file
class UCL_Program {
public:
inline UCL_Program() : _init_done(false) {}
inline UCL_Program(UCL_Device &device) : _init_done(false) { init(device); }
inline ~UCL_Program() { clear(); }
/// Initialize the program with a device
inline void init(UCL_Device &device) {
clear();
_device=device.cl_device();
_context=device.context();
_cq=device.cq();
CL_SAFE_CALL(clRetainContext(_context));
CL_SAFE_CALL(clRetainCommandQueue(_cq));
_init_done=true;
}
/// Clear any data associated with program
/** \note Must call init() after each clear **/
inline void clear() {
if (_init_done) {
CL_SAFE_CALL(clReleaseProgram(_program));
CL_SAFE_CALL(clReleaseContext(_context));
CL_SAFE_CALL(clReleaseCommandQueue(_cq));
_init_done=false;
}
}
/// Load a program from a file and compile with flags
inline int load(const char *filename, const char *flags="",
std::string *log=NULL) {
std::ifstream in(filename);
if (!in || in.is_open()==false) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not open kernel file: "
<< filename << std::endl;
exit(1);
#endif
return UCL_FILE_NOT_FOUND;
}
std::string program((std::istreambuf_iterator<char>(in)),
std::istreambuf_iterator<char>());
in.close();
return load_string(program.c_str(),flags,log);
}
/// Load a program from a string and compile with flags
inline int load_string(const char *program, const char *flags="",
std::string *log=NULL) {
cl_int error_flag;
const char *prog=program;
_program=clCreateProgramWithSource(_context,1,&prog,NULL,&error_flag);
CL_CHECK_ERR(error_flag);
error_flag = clBuildProgram(_program,1,&_device,flags,NULL,NULL);
cl_build_status build_status;
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,
CL_PROGRAM_BUILD_STATUS,
sizeof(cl_build_status),&build_status,
NULL));
if (build_status != CL_SUCCESS || log!=NULL) {
size_t ms;
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0,
NULL, &ms));
char build_log[ms];
CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms,
build_log, NULL));
if (log!=NULL)
*log=std::string(build_log);
if (build_status != CL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << std::endl
<< "----------------------------------------------------------\n"
<< " UCL Error: Error compiling OpenCL Program...\n"
<< "----------------------------------------------------------\n";
std::cerr << build_log << std::endl;
#endif
return UCL_COMPILE_ERROR;
}
}
return UCL_SUCCESS;
}
friend class UCL_Kernel;
private:
bool _init_done;
cl_program _program;
cl_device_id _device;
cl_context _context;
cl_command_queue _cq;
};
/// Class for dealing with OpenCL kernels
class UCL_Kernel {
public:
UCL_Kernel() : _dimensions(1), _function_set(false), _num_args(0)
{ _block_size[0]=0; _num_blocks[0]=0; }
inline UCL_Kernel(UCL_Program &program, const char *function) :
_dimensions(1), _function_set(false), _num_args(0)
{ _block_size[0]=0; _num_blocks[0]=0; set_function(program,function); }
inline ~UCL_Kernel() { clear(); }
/// Clear any function associated with the kernel
inline void clear() {
if (_function_set) {
clReleaseKernel(_kernel);
clReleaseProgram(_program);
clReleaseCommandQueue(_cq);
_function_set=false;
}
}
/// Get the kernel function from a program
/** \return UCL_ERROR_FLAG (UCL_SUCCESS, UCL_FILE_NOT_FOUND, UCL_ERROR) **/
inline int set_function(UCL_Program &program, const char *function);
/// Set the kernel argument.
/** If not a device pointer, this must be repeated each time the argument
* changes **/
template <class dtype>
inline void set_arg(const cl_uint index, dtype *arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,index,sizeof(dtype),arg));
if (index>_num_args) _num_args=index;
}
/// Add a kernel argument.
template <class dtype>
inline void add_arg(dtype *arg) {
CL_SAFE_CALL(clSetKernelArg(_kernel,_num_args,sizeof(dtype),arg));
_num_args++;
}
/// Set the number of thread blocks and the number of threads in each block
inline void set_size(const size_t num_blocks, const size_t block_size) {
_dimensions=1;
_num_blocks[0]=num_blocks*block_size;
_block_size[0]=block_size;
}
/// Set the number of thread blocks and the number of threads in each block
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x, const size_t block_size_y) {
_dimensions=2;
_num_blocks[0]=num_blocks_x*block_size_x;
_block_size[0]=block_size_x;
_num_blocks[1]=num_blocks_y*block_size_y;
_block_size[1]=block_size_y;
}
/// Set the number of thread blocks and the number of threads in each block
inline void set_size(const size_t num_blocks_x, const size_t num_blocks_y,
const size_t block_size_x,
const size_t block_size_y, const size_t block_size_z) {
_dimensions=3;
const size_t num_blocks_z=1;
_num_blocks[0]=num_blocks_x*block_size_x;
_block_size[0]=block_size_x;
_num_blocks[1]=num_blocks_y*block_size_y;
_block_size[1]=block_size_y;
_num_blocks[2]=num_blocks_z*block_size_z;
_block_size[2]=block_size_z;
}
/// Run the kernel in the default command queue
inline void run() {
run(_cq);
}
/// Run the kernel in the specified command queue
inline void run(command_queue &cq) {
CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,_kernel,_dimensions,NULL,
_num_blocks,_block_size,0,NULL,NULL));
}
/// Clear any arguments associated with the kernel
inline void clear_args() { _num_args=0; }
#include "ucl_arg_kludge.h"
private:
cl_kernel _kernel;
cl_program _program;
cl_uint _dimensions;
size_t _block_size[3];
size_t _num_blocks[3];
bool _function_set;
cl_command_queue _cq; // The default command queue for this kernel
unsigned _num_args;
};
inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) {
clear();
_function_set=true;
_cq=program._cq;
CL_SAFE_CALL(clRetainCommandQueue(_cq));
_program=program._program;
CL_SAFE_CALL(clRetainProgram(_program));
cl_int error_flag;
_kernel=clCreateKernel(program._program,function,&error_flag);
if (error_flag!=CL_SUCCESS) {
#ifndef UCL_NO_EXIT
std::cerr << "UCL Error: Could not find function: " << function
<< " in program.\n";
exit(1);
#endif
return UCL_FUNCTION_NOT_FOUND;
}
return UCL_SUCCESS;
}
} // namespace
#endif
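A hedged sketch (not in the original commit) of the compile-and-launch path offered by UCL_Program and UCL_Kernel above. The kernel source string, the function name "scale", the 128-thread block size, and the buffer x (created elsewhere with clCreateBuffer) are illustrative assumptions; the rest uses only members declared in this header, in ocl_device.h, and the UCL_* status codes referenced by load_string.

#include "ocl_device.h"
#include "ocl_kernel.h"

using namespace ucl_opencl;

// Hypothetical OpenCL C source; any valid kernel string works here.
static const char *scale_src =
  "__kernel void scale(__global float *x, const float f, const int n) {"
  "  int i=get_global_id(0); if (i<n) x[i]*=f; }";

int scale_on_device(UCL_Device &dev, cl_mem x, float f, int n) {
  UCL_Program prog(dev);                // retains the device context and queue
  std::string build_log;
  if (prog.load_string(scale_src,"",&build_log)!=UCL_SUCCESS)
    return UCL_COMPILE_ERROR;           // build_log holds the compiler output
  UCL_Kernel k(prog,"scale");
  k.set_size((n+127)/128,128);          // global size = num_blocks * block_size
  k.run(&x,&f,&n);                      // overload supplied by ucl_arg_kludge.h
  dev.sync();                           // wait for the launch in the default queue
  return UCL_SUCCESS;
}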

56
lib/gpu/geryon/ocl_mat.h Normal file
View File

@@ -0,0 +1,56 @@
/***************************************************************************
ocl_mat.h
-------------------
W. Michael Brown
OpenCL Specific Vector/Matrix Containers, Memory Management, and I/O
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Wed Jan 13 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/*! \file */
#ifndef OCL_MAT_H
#define OCL_MAT_H
#include "ocl_memory.h"
/// Namespace for OpenCL routines
namespace ucl_opencl {
#define _UCL_MAT_ALLOW
#define _UCL_DEVICE_PTR_MAT
#define _OCL_MAT
#include "ucl_basemat.h"
#include "ucl_h_vec.h"
#include "ucl_h_mat.h"
#include "ucl_d_vec.h"
#include "ucl_d_mat.h"
#undef _UCL_DEVICE_PTR_MAT
#undef _OCL_MAT
#undef _UCL_MAT_ALLOW
#define UCL_COPY_ALLOW
#include "ucl_copy.h"
#undef UCL_COPY_ALLOW
#define UCL_PRINT_ALLOW
#include "ucl_print.h"
#undef UCL_PRINT_ALLOW
} // namespace ucl_opencl
#endif
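A sketch (not in this commit) of how the containers pulled in above pair with the ucl_copy routines from ucl_copy.h. To avoid assuming the container allocation API, it takes already-allocated host and device containers (for example a UCL_H_Vec<float> and a UCL_D_Vec<float> of at least n elements) and relies only on the copy signatures documented in ucl_copy.h plus the sync() member from ucl_basemat.h.

#include "ocl_mat.h"

// Assumes `host` and `device` are already-allocated Geryon containers.
template <class host_vec, class device_vec>
void round_trip(host_vec &host, device_vec &device, const size_t n,
                ucl_opencl::command_queue &cq) {
  ucl_copy(device,host,false);     // blocking host -> device copy of all elements
  ucl_copy(host,device,n,cq);      // asynchronous copy of n elements in queue cq
  host.sync();                     // block until the queued copy has completed
}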

59
lib/gpu/geryon/ocl_texture.h Normal file
View File

@@ -0,0 +1,59 @@
/***************************************************************************
ocl_texture.h
-------------------
W. Michael Brown
Utilities for dealing with OpenCL textures
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Fri Jul 2 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef OCL_TEXTURE
#define OCL_TEXTURE
#include "ocl_kernel.h"
#include "ocl_mat.h"
namespace ucl_opencl {
/// Class storing a texture reference
class UCL_Texture {
public:
UCL_Texture() {}
~UCL_Texture() {}
/// Construct with a specified texture reference
inline UCL_Texture(UCL_Program &prog, const char *texture_name) { }
/// Set the texture reference for this object
inline void get_texture(UCL_Program &prog, const char *texture_name) { }
/// Bind a float array where each fetch grabs a vector of length numel
template<class mat_typ>
inline void bind_float(mat_typ &vec, const unsigned numel) { }
/// Unbind the texture reference from the memory allocation
inline void unbind() { }
/// Make a texture reference available to kernel
inline void allow(UCL_Kernel &kernel) { }
private:
friend class UCL_Kernel;
};
} // namespace
#endif

111
lib/gpu/geryon/ocl_timer.h Normal file
View File

@@ -0,0 +1,111 @@
/***************************************************************************
ocl_timer.h
-------------------
W. Michael Brown
Class for timing OpenCL routines
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Fri Jan 22 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef OCL_TIMER_H
#define OCL_TIMER_H
#include "ocl_macros.h"
namespace ucl_opencl {
/// Class for timing OpenCL events
class UCL_Timer {
public:
inline UCL_Timer() : _total_time(0.0f), _initialized(false) { }
inline UCL_Timer(UCL_Device &dev) : _total_time(0.0f), _initialized(false)
{ init(dev); }
inline ~UCL_Timer() { clear(); }
/// Clear any data associated with timer
/** \note init() must be called to reuse timer after a clear() **/
inline void clear() {
if (_initialized) {
CL_SAFE_CALL(clReleaseCommandQueue(_cq));
clReleaseEvent(start_event);
clReleaseEvent(stop_event);
_initialized=false;
_total_time=0.0;
}
}
/// Initialize default command queue for timing
inline void init(UCL_Device &dev) { init(dev,dev.cq()); }
/// Initialize command queue for timing
inline void init(UCL_Device &dev, command_queue &cq) {
clear();
t_factor=dev.timer_resolution()/1000000000.0;
_cq=cq;
clRetainCommandQueue(_cq);
_initialized=true;
}
/// Start timing on default command queue
inline void start() { clEnqueueMarker(_cq,&start_event); }
/// Stop timing on default command queue
inline void stop() { clEnqueueMarker(_cq,&stop_event); }
/// Set the time elapsed to zero (not the total_time)
inline void zero()
{ clEnqueueMarker(_cq,&start_event); clEnqueueMarker(_cq,&stop_event); }
/// Add time from previous start and stop to total
/** Forces synchronization **/
inline double add_to_total()
{ double t=time(); _total_time+=t; return t/1000.0; }
/// Return the time (ms) of last start to stop - Forces synchronization
inline double time() {
cl_ulong tstart,tend;
CL_SAFE_CALL(clWaitForEvents(1,&stop_event));
CL_SAFE_CALL(clGetEventProfilingInfo(stop_event,
CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &tend, NULL));
CL_SAFE_CALL(clGetEventProfilingInfo(start_event,
CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &tstart, NULL));
return (tend-tstart)*t_factor;
}
/// Return the time (s) of last start to stop - Forces synchronization
inline double seconds() { return time()/1000.0; }
/// Return the total time in ms
inline double total_time() { return _total_time; }
/// Return the total time in seconds
inline double total_seconds() { return _total_time/1000.0; }
private:
cl_event start_event, stop_event;
cl_command_queue _cq;
double _total_time;
bool _initialized;
double t_factor;
};
} // namespace
#endif
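A minimal sketch (not part of the commit) of timing queued work with the UCL_Timer above; it assumes the queue being timed was created with profiling enabled, which push_command_queue() in ocl_device.h does for every queue it creates.

#include "ocl_device.h"
#include "ocl_timer.h"
#include <iostream>

void time_queued_work(ucl_opencl::UCL_Device &dev) {
  ucl_opencl::UCL_Timer t(dev);      // places marker events in dev.cq()
  t.start();
  // ... enqueue kernels and/or copies in dev.cq() here ...
  t.stop();
  double secs = t.add_to_total();    // synchronizes; returns seconds, accumulates ms
  std::cout << "Last interval: " << secs << " s, running total: "
            << t.total_seconds() << " s\n";
}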

673
lib/gpu/geryon/ucl_arg_kludge.h Normal file
View File

@@ -0,0 +1,673 @@
/***************************************************************************
ucl_arg_kludge.h
-------------------
W. Michael Brown
Allow multiple arguments to be added for a kernel call at a single time
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Sun Feb 7 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
template <class t1, class t2>
inline void add_args(t1 *a1, t2 *a2) {
add_arg(a1); add_arg(a2);
}
template <class t1, class t2, class t3>
inline void add_args(t1 *a1, t2 *a2, t3 *a3) {
add_arg(a1); add_arg(a2); add_arg(a3);
}
template <class t1, class t2, class t3, class t4>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
}
template <class t1, class t2, class t3, class t4, class t5>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20>
inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
}
// ---------------------------------------------------------------------------
template <class t1>
inline void run(t1 *a1) {
clear_args();
add_arg(a1);
run();
}
template <class t1, class t2>
inline void run(t1 *a1, t2 *a2) {
clear_args();
add_arg(a1); add_arg(a2);
run();
}
template <class t1, class t2, class t3>
inline void run(t1 *a1, t2 *a2, t3 *a3) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3);
run();
}
template <class t1, class t2, class t3, class t4>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
run();
}
template <class t1, class t2, class t3, class t4, class t5>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
run();
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20>
inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
run();
}
// ---------------------------------------------------------------------------
template <class t1>
inline void run_cq(command_queue &cq, t1 *a1) {
clear_args();
add_arg(a1);
run(cq);
}
template <class t1, class t2>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2) {
clear_args();
add_arg(a1); add_arg(a2);
run(cq);
}
template <class t1, class t2, class t3>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3);
run(cq);
}
template <class t1, class t2, class t3, class t4>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19);
run(cq);
}
template <class t1, class t2, class t3, class t4, class t5,
class t6, class t7, class t8, class t9, class t10,
class t11, class t12, class t13, class t14, class t15,
class t16, class t17, class t18, class t19, class t20>
inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5,
t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10,
t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15,
t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20) {
clear_args();
add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5);
add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10);
add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15);
add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20);
run(cq);
}
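A brief sketch (not in the commit): because ucl_arg_kludge.h is included inside the UCL_Kernel class body, the overloads above let a caller set every argument and launch in a single call. The kernel k is assumed to already have a function and launch size set (see ocl_kernel.h); buf, f and n are illustrative arguments.

#include "ocl_kernel.h"

void launch_twice(ucl_opencl::UCL_Kernel &k, ucl_opencl::command_queue &cq,
                  cl_mem buf, float f, int n) {
  k.run(&buf,&f,&n);             // default queue: clear_args + 3x add_arg + run()
  k.run_cq(cq,&buf,&f,&n);       // same arguments, enqueued in the caller's queue
}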

77
lib/gpu/geryon/ucl_basemat.h Normal file
View File

@@ -0,0 +1,77 @@
/***************************************************************************
ucl_basemat.h
-------------------
W. Michael Brown
Vector/Matrix Base Container
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
#include "ucl_types.h"
#define UCL_H_VecT UCL_H_Vec<numtyp>
#define UCL_H_VecD UCL_H_Vec<double>
#define UCL_H_VecS UCL_H_Vec<float>
#define UCL_H_VecI UCL_H_Vec<int>
#define UCL_D_VecT UCL_D_Vec<numtyp>
#define UCL_D_VecD UCL_D_Vec<double>
#define UCL_D_VecS UCL_D_Vec<float>
#define UCL_D_VecI UCL_D_Vec<int>
#define UCL_D_VecI2 UCL_D_Vec<int2>
#define UCL_D_VecU2 UCL_D_Vec<uint2>
#define UCL_D_MatT UCL_D_Mat<numtyp>
#define UCL_D_MatD UCL_D_Mat<double>
#define UCL_D_MatS UCL_D_Mat<float>
#define UCL_D_MatI UCL_D_Mat<int>
#define UCL_ConstMatT UCL_ConstMat<numtyp>
#define UCL_ConstMatD UCL_ConstMat<double>
#define UCL_ConstMatS UCL_ConstMat<float>
#define UCL_ConstMatI UCL_ConstMat<int>
#define UCL_ConstMatD2 UCL_ConstMat<double2>
/// Base class for vector/matrix containers
/** All containers are associated with a default command queue.
* For CUDA, this is the default stream.
*
* The default queue is used for asynchronous operations on the container
* that do not specify a queue. For OpenCL, this queue is also used in
* calls for reserving and copying memory **/
class UCL_BaseMat {
public:
UCL_BaseMat() : _cq(0) { }
virtual ~UCL_BaseMat() { }
/// Return the default command queue/stream associated with this data
inline command_queue & cq() { return _cq; }
/// Block until command_queue associated with matrix is complete
inline void sync() { ucl_sync(_cq); }
#ifdef UCL_DEBUG
// Returns the type of host allocation
virtual inline enum UCL_MEMOPT kind() const { return UCL_NOT_PINNED; }
#endif
protected:
command_queue _cq;
};
#endif

826
lib/gpu/geryon/ucl_copy.h Normal file
View File

@@ -0,0 +1,826 @@
/***************************************************************************
ucl_copy.h
-------------------
W. Michael Brown
Routines for copying matrix/vector data onto and off coprocessor device
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Mon Jan 4 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
/***************************************************************************
The ucl_copy and ucl_cast_copy routines provide a general prototype for
copying data between host and device memory (including texture memory)
for the matrix and vector types in nvc_memory.
For host/host and host/device transfers, typecasting is performed
automatically as necessary.
The routines are written so that all branches can be removed by the
compiler during template instantiation.
The routines currently assume row-major ordering for all types.
For an asynchronous copy in the default command queue, set async to boolean
true; for an asynchronous copy in a specified command queue, pass that
command queue as async. Otherwise, set async to boolean false.
When performing frequent data copies that require casting, it is more
efficient to allocate a casting buffer once and then pass that buffer
to the copy routine. This can be accomplished with the ucl_cast_copy
routines.
Examples
(x's represent alignment padding - to maintain alignment)
(o's represent a larger matrix in memory)
(vectors represented as single row)
----------------------------------------------------------------
   dst              src                   command
----------------------------------------------------------------
   0 1 2 3 4    <-- 0 1 2 3 4             ucl_copy(dst,src,async)
   0 1 2 3      <-- 0 1 2 3 4             ucl_copy(dst,src,4,async)
   0 1 2        <-- 0 1 2 3 4 5           ucl_copy(dst,src,async)
   3 4 5
   0 1 2 3 4 5  <-- 0 1 2                 ucl_copy(dst,src,async)
                    3 4 5
   0 1 2        <-- 0 1 2                 ucl_copy(dst,src,async)
   3 4 5            3 4 5
   0 1 2        <-- 0 1 2                 ucl_copy(dst,src,6,async)
   3 4 5            3 4 5
                    5 6 7
   0 1 2        <-- 0 1 2 3               ucl_copy(dst,src,2,3,async)
   4 5 6            4 5 6 7
                    8 9 10 11
   0 1 2 x x    <-- 0 1 2                 ucl_copy(dst,src,async)
   3 4 5 x x        3 4 5
   0 1 2        <-- 0 1 2 x x             ucl_copy(dst,src,async)
   3 4 5            3 4 5 x x
   0 1 2 o o    <-- 0 1 2                 ucl_copy(dst,src,2,3,async)
   3 4 5 o o        3 4 5
   o o o o o
   0 1 2 o o    <-- 0 1 2 3 4 5           ucl_copy(dst,src,2,3,async)
   3 4 5 o o
   o o o o o
   0 1 o o o    <-- 0 1 2 3 4 5           ucl_copy(dst,src,2,2,async)
   2 3 o o o
   o o o o o
   0 1 2 o o    <-- 0 1 2 3 4             ucl_copy(dst,src,2,3,async)
   5 6 7 o o        5 6 7 8 9
   o o o o o        10 11 12 13 14
   0 1 2 5 6 7  <-- 0 1 2 3 4             ucl_copy(dst,src,2,3,async)
                    5 6 7 8 9
                    10 11 12 13 14
***************************************************************************/
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_COPY_ALLOW
// --------------------------------------------------------------------------
// - HOST-HOST COPY ROUTINES
// --------------------------------------------------------------------------
// Have to use specialization because some types don't have operator[]
template <int host_t1, int host_t2> struct _host_host_copy;
// Both on host
template <> struct _host_host_copy<1,1> {
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
#ifdef UCL_DEBUG
assert(mat1::PADDED==0 && mat2::PADDED==0);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0)
memcpy(dst.begin(),src.begin(),numel*sizeof(typename mat1::data_type));
else
for (size_t i=0; i<numel; i++)
dst[i]=static_cast<typename mat1::data_type>(src[i]);
}
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols) {
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
size_t dst_row_size, src_row_size;
if (mat1::VECTOR)
dst_row_size=cols;
else
dst_row_size=dst.row_size();
if (mat2::VECTOR)
src_row_size=cols;
else
src_row_size=src.row_size();
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE && mat1::DATA_TYPE!=0)
for (size_t i=0; i<rows; i++)
memcpy(dst.begin()+i*dst_row_size,src.begin()+i*src_row_size,
cols*sizeof(typename mat1::data_type));
else
for (size_t j=0; j<rows; j++) {
int dst_i=j*dst_row_size;
int d_end=dst_i+cols;
int src_i=j*src_row_size;
for (; dst_i<d_end; dst_i++) {
dst[dst_i]=static_cast<typename mat1::data_type>(src[src_i]);
src_i++;
}
}
}
};
// Should never be here
template <int host_t1, int host_t2> struct _host_host_copy {
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t numel) {
assert(0==1);
}
template <class mat1, class mat2>
static inline void hhc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols) {
assert(0==1);
}
};
// --------------------------------------------------------------------------
// - TEMPLATE HELPER FUNCTIONS FOR SPECIALIZED CASTING
// --------------------------------------------------------------------------
// Helper functions for ucl_cast_copy
template <int host_type1, int host_type2> struct _ucl_cast_copy;
// Destination is on host
template <int host_type2> struct _ucl_cast_copy<1,host_type2> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type));
for (size_t i=0; i<numel; i++)
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer,command_queue &cq) {
ucl_mv_cpy(cast_buffer,src,numel*sizeof(typename mat2::data_type),cq);
cast_buffer.sync();
for (size_t i=0; i<numel; i++)
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
// Asynchronous currently pointless here
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
#endif
if (mat1::VECTOR) {
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows);
for (size_t i=0; i<rows*cols; i++)
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
} else {
if (mat2::VECTOR)
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
cols*sizeof(typename mat2::data_type),
cols*sizeof(typename mat2::data_type),rows);
else
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),
rows);
int dst_i=0;
int buff_i=0;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
buff_i++;
dst_i++;
}
dst_i+=dst.cols()-cols;
}
}
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
// Asynchronous currently pointless here
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(dst.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
#endif
if (mat1::VECTOR) {
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),rows,cq);
cast_buffer.sync();
for (size_t i=0; i<rows*cols; i++)
dst[i]=static_cast<typename mat1::data_type>(cast_buffer[i]);
} else {
if (mat2::VECTOR)
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
cols*sizeof(typename mat2::data_type),
cols*sizeof(typename mat2::data_type),rows,cq);
else
ucl_mv_cpy(cast_buffer,cols*sizeof(typename mat2::data_type),src,
src.row_bytes(),cols*sizeof(typename mat2::data_type),
rows,cq);
cast_buffer.sync();
int dst_i=0;
int buff_i=0;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
dst[dst_i]=static_cast<typename mat1::data_type>(cast_buffer[buff_i]);
buff_i++;
dst_i++;
}
dst_i+=dst.cols()-cols;
}
}
}
};
// Source is on host
template <int host_type1> struct _ucl_cast_copy<host_type1,1> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
for (size_t i=0; i<numel; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type));
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
for (size_t i=0; i<numel; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,cast_buffer,numel*sizeof(typename mat1::data_type),cq);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
#endif
if (mat2::VECTOR) {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows);
} else if (mat1::VECTOR) {
int src_i=0;
int buf_i=0;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
}
ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows);
} else {
int src_i=0;
int buf_i=0;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows);
}
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
#ifdef UCL_DEBUG
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(src.numel()>=rows*cols && cast_buffer.numel()>=rows*cols);
if (mat1::VECTOR==0) assert(dst.rows()>=rows && dst.cols()>=cols);
if (mat2::VECTOR==0) assert(src.rows()>=rows && src.cols()>=cols);
#endif
if (mat2::VECTOR) {
for (size_t i=0; i<rows*cols; i++)
cast_buffer[i]=static_cast<typename mat3::data_type>(src[i]);
ucl_mv_cpy(dst,dst.row_bytes(),
cast_buffer,cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows,cq);
} else if (mat1::VECTOR) {
int src_i=0;
int buf_i=0;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
}
ucl_mv_cpy(dst,cast_buffer,cols*sizeof(typename mat1::data_type)*rows,cq);
} else {
int src_i=0;
int buf_i=0;
for (size_t i=0; i<rows; i++) {
for (size_t j=0; j<cols; j++) {
cast_buffer[buf_i]=static_cast<typename mat3::data_type>(src[src_i]);
buf_i++;
src_i++;
}
src_i+=src.cols()-cols;
}
ucl_mv_cpy(dst,dst.row_bytes(),cast_buffer,
cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows,cq);
}
}
};
// Both matrices on host: cast copy should never be used here
template <> struct _ucl_cast_copy<1,1> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
assert(0==1);
}
};
// Neither matrix on host: cast copy should never be used here
template <> struct _ucl_cast_copy<0,0> {
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer) {
assert(0==1);
}
template <class mat1, class mat2, class mat3>
static inline void cc(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
assert(0==1);
}
};
// --------------------------------------------------------------------------
// - 1D COPY - SPECIFIED NUMBER OF BYTES
// --------------------------------------------------------------------------
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param numel Number of elements (not bytes) to copy
* \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, command_queue &cq) {
#ifdef UCL_DEBUG
assert(dst.numel()>=numel && src.numel()>=numel);
assert(cast_buffer.numel()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,numel,cq);
else
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
}
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param numel Number of elements (not bytes) to copy
* \param async Perform non-blocking copy on default stream
* \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t numel,
mat3 &cast_buffer, const bool async) {
#ifdef UCL_DEBUG
assert(dst.numel()>=numel && src.numel()>=numel);
assert(cast_buffer.numel()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,numel,async);
else if (async)
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,dst.cq());
else
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer);
}
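// Illustrative sketch (names are hypothetical): reusing one permanent cast
// buffer for repeated double->float host-to-device transfers instead of
// letting ucl_copy allocate a temporary buffer on every call.
//
//   UCL_Device gpu;
//   UCL_H_Vec<double> h_x;                     // source on host
//   UCL_D_Vec<float>  d_x;                     // destination on device
//   UCL_H_Vec<float>  buffer;                  // permanent casting buffer
//   h_x.alloc(n,gpu,UCL_RW_OPTIMIZED);
//   d_x.alloc(n,gpu,UCL_READ_WRITE);
//   buffer.alloc(n,gpu,UCL_WRITE_OPTIMIZED);
//   ucl_cast_copy(d_x,h_x,n,buffer,true);      // async on default stream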
/// Asynchronous copy of matrix/vector (memory already allocated)
/** \param numel Number of elements (not bytes) to copy
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - Padding for 2D matrices is not considered in this routine.
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
command_queue &cq) {
#ifdef UCL_DEBUG
assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer,cq);
}
} else
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type),cq);
}
/// Copy matrix/vector (memory already allocated)
/** \param numel Number of elements (not bytes) to copy
* \param async Perform non-blocking copy (ignored for host to host copy)
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - Padding for 2D matrices is not considered in this routine.
* - The default stream is used for asynchronous copy
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t numel,
const bool async) {
#ifdef UCL_DEBUG
assert(dst.row_size()*dst.rows()>=numel && src.row_size()*src.rows()>=numel);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,numel);
else if (async)
ucl_copy(dst,src,numel,dst.cq());
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_RW_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(numel,dst,UCL_WRITE_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,numel,
cast_buffer);
}
} else
ucl_mv_cpy(dst,src,numel*sizeof(typename mat2::data_type));
}
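// Illustrative sketch (names are hypothetical): copy only the first n
// elements of a pinned host vector to the device; both containers use the
// same data type, so no cast is performed.
//
//   UCL_H_Vec<float> h_q;
//   UCL_D_Vec<float> d_q;
//   h_q.alloc(nmax,gpu,UCL_WRITE_OPTIMIZED);
//   d_q.alloc(nmax,gpu,UCL_READ_ONLY);
//   // ... fill h_q[0..n-1] on the host ...
//   ucl_copy(d_q,h_q,n,true);                  // non-blocking copy of n elements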
// --------------------------------------------------------------------------
// - 2D COPY - SPECIFIED NUMBER OF ROWS/COLS
// --------------------------------------------------------------------------
/// Asynchronous copy subset matrix rows/cols with cast (Device/Host transfer)
/** \param async Perform non-blocking copy on default stream
* \param cast_buffer Buffer on host with enough storage for casting
* - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
const bool async) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,rows,cols,async);
else if (async)
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,dst.cq());
else
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer);
}
/// Asynchronous copy subset matrix rows,cols with cast (Device/Host transfer)
/** \param cast_buffer Buffer on host with enough storage for casting
* - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types for the two matrices are same, no cast performed
* - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, mat3 &cast_buffer,
command_queue &cq) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,rows,cols,cq);
else
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,cq);
}
/// Asynchronous copy of subset matrix rows,cols (memory already allocated)
/** - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - The copy should handle padding for 2D alignment correctly
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, command_queue &cq) {
if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,cq);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer,cq);
}
// If we are here, at least one of the matrices must have VECTOR=0
} else if (mat1::VECTOR) {
#ifdef UCL_DEBUG
assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
cols*sizeof(typename mat1::data_type),rows,
cq);
} else if (mat2::VECTOR) {
#ifdef UCL_DEBUG
assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows,cq);
} else {
#ifdef UCL_DEBUG
assert(src.rows()>=rows && src.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
cols*sizeof(typename mat1::data_type),rows,cq);
}
}
/// Copy subset of matrix rows,cols (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy)
* - If src is a vector, routine assumes row-major rows by cols copy
* - If src is a matrix, routine will copy upper left tile of matrix
* - If dst is a vector, routine assumes row-major rows by cols copy
* - If dst is a matrix, routine will copy into upper left tile of matrix
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - The copy should handle padding for 2D alignment correctly
* - Copy from vector to matrix and vice versa allowed
* - The default stream is used for asynchronous copy
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const size_t rows,
const size_t cols, const bool async) {
if (async)
ucl_copy(dst,src,rows,cols,dst.cq());
else if (mat1::MEM_TYPE==1 && mat2::MEM_TYPE==1)
_host_host_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::hhc(dst,src,rows,cols);
else if ((int)mat1::DATA_TYPE!=(int)mat2::DATA_TYPE &&
(mat1::MEM_TYPE==1 || mat2::MEM_TYPE==1)) {
if (mat1::MEM_TYPE==1) {
UCL_H_Vec<typename mat2::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_RW_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer);
} else {
UCL_H_Vec<typename mat1::data_type> cast_buffer;
cast_buffer.alloc(rows*cols,dst,UCL_WRITE_OPTIMIZED);
_ucl_cast_copy<mat1::MEM_TYPE,mat2::MEM_TYPE>::cc(dst,src,rows,cols,
cast_buffer);
}
// If we are here, at least one of the matrices must have VECTOR=0
} else if (mat1::VECTOR) {
#ifdef UCL_DEBUG
assert(dst.numel()>=rows*cols && src.rows()>=rows && src.cols()>=cols);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(mat2::VECTOR==0);
#endif
ucl_mv_cpy(dst,cols*sizeof(typename mat1::data_type),src,src.row_bytes(),
cols*sizeof(typename mat1::data_type),rows);
} else if (mat2::VECTOR) {
#ifdef UCL_DEBUG
assert(src.numel()>=rows*cols && dst.rows()>=rows && dst.cols()>=cols);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
assert(mat1::VECTOR==0);
#endif
ucl_mv_cpy(dst,dst.row_bytes(),src,cols*sizeof(typename mat1::data_type),
cols*sizeof(typename mat1::data_type),rows);
} else {
#ifdef UCL_DEBUG
assert(src.rows()>=rows && src.cols()>=cols);
assert(dst.rows()>=rows && dst.cols()>=cols);
assert(mat1::ROW_MAJOR==1 && mat2::ROW_MAJOR==1);
#endif
ucl_mv_cpy(dst,dst.row_bytes(),src,src.row_bytes(),
cols*sizeof(typename mat1::data_type),rows);
}
}
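// Illustrative sketch (names are hypothetical): copy the upper left
// rows x cols tile of a padded device matrix into a contiguous host vector;
// the differing row strides are handled by the routine.
//
//   UCL_D_Mat<float> d_tile;                   // padded rows on the device
//   UCL_H_Vec<float> h_tile;                   // row-major, no padding
//   d_tile.alloc(rows,cols,gpu);
//   h_tile.alloc(rows*cols,gpu);
//   ucl_copy(h_tile,d_tile,rows,cols,false);   // blocking tile copy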
// --------------------------------------------------------------------------
// - 1D/2D COPY
// --------------------------------------------------------------------------
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param async Perform non-blocking copy on default stream
* \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - The number of bytes copied is determined by entire src data
* - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
mat3 &cast_buffer, const bool async) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,async);
else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) )
ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,async);
else if (mat1::PADDED==1)
ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,async);
else
ucl_cast_copy(dst,src,src.numel(),cast_buffer,async);
}
/// Asynchronous copy of matrix/vector with cast (Device/Host transfer)
/** \param cast_buffer Buffer on host with enough storage for casting
* - If the data types for the two matrices are same, no cast performed
* - The number of bytes copied is determined by entire src data
* - Padding for 2D matrices is not considered in this routine.
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2, class mat3>
inline void ucl_cast_copy(mat1 &dst, const mat2 &src,
mat3 &cast_buffer, command_queue &cq) {
if ((int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,cq);
else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0))
ucl_cast_copy(dst,src,src.rows(),src.cols(),cast_buffer,cq);
else if (mat1::PADDED==1)
ucl_cast_copy(dst,src,dst.rows(),dst.cols(),cast_buffer,cq);
else
ucl_cast_copy(dst,src,src.numel(),cast_buffer,cq);
}
/// Asynchronous copy of matrix/vector (memory already allocated)
/** - The number of bytes copied is determined by entire src data
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - The copy should handle padding for 2D alignment correctly
* - Copy from vector to matrix and vice versa allowed
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, command_queue &cq) {
if (dst.row_bytes()==src.row_bytes() &&
src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
(int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,src.row_size()*src.rows(),cq);
else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) )
ucl_copy(dst,src,src.rows(),src.cols(),cq);
else if (mat1::PADDED==1)
ucl_copy(dst,src,dst.rows(),dst.cols(),cq);
else
ucl_copy(dst,src,src.numel(),cq);
}
/// Copy matrix/vector (memory already allocated)
/** \param async Perform non-blocking copy (ignored for host to host copy)
* - The number of bytes copied is determined by entire src data
* - If the data types of the two matrices are not the same,
* casting will be performed automatically as long as the copy is
* not device to device. For host/device transfers, a temporary
* buffer is created for copy. When multiple casts occur, it is
* more efficient to create a permanent casting buffer that can
* be passed to an alternative copy routine.
* - The copy should handle padding for 2D alignment correctly
* - Copy from vector to matrix and vice versa allowed
* - The default stream is used for asynchronous copy
* - Currently does not handle textures **/
template <class mat1, class mat2>
inline void ucl_copy(mat1 &dst, const mat2 &src, const bool async) {
if (async)
ucl_copy(dst,src,dst.cq());
else if (dst.row_bytes()==src.row_bytes() &&
src.kind()!=UCL_VIEW && dst.kind()!=UCL_VIEW &&
(int)mat1::DATA_TYPE==(int)mat2::DATA_TYPE)
ucl_copy(dst,src,src.row_size()*src.rows(),async);
else if (mat2::PADDED==1 || (mat1::PADDED==1 && mat2::VECTOR==0) )
ucl_copy(dst,src,src.rows(),src.cols(),async);
else if (mat1::PADDED==1)
ucl_copy(dst,src,dst.rows(),dst.cols(),async);
else
ucl_copy(dst,src,src.numel(),async);
}
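// Illustrative sketch (names are hypothetical): whole-container copy. Row
// padding on the device matrix is accounted for automatically because the
// rows/cols form of ucl_copy is selected whenever either side is padded.
//
//   UCL_D_Mat<float> d_f;
//   UCL_H_Mat<float> h_f;
//   d_f.alloc(rows,cols,gpu);
//   h_f.alloc(rows,cols,gpu,UCL_RW_OPTIMIZED);
//   ucl_copy(h_f,d_f,true);                    // async on h_f's default queue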
#endif

430
lib/gpu/geryon/ucl_d_mat.h Normal file
View File

@ -0,0 +1,430 @@
/***************************************************************************
ucl_d_mat.h
-------------------
W. Michael Brown
Matrix Container on Device
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// 2D Matrix on device (can have extra column storage to get correct alignment)
template <class numtyp>
class UCL_D_Mat : public UCL_BaseMat {
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum traits {
DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
MEM_TYPE = 0,
PADDED = 1,
ROW_MAJOR = 1,
VECTOR = 0
};
typedef numtyp data_type;
UCL_D_Mat() : _rows(0), _kind(UCL_VIEW) {}
~UCL_D_Mat() { if (_kind!=UCL_VIEW) _device_free(*this); }
/// Construct with specified rows and cols
/** \sa alloc() **/
UCL_D_Mat(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
_rows(0), _kind(UCL_VIEW) { alloc(rows,cols,device,kind); }
/// Row major matrix on device
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param cq Default command queue for operations copied from another mat
* \note - Coalesced access using adjacent cols on same row
* UCL_D_Mat(row,col) given by array[row*row_size()+col]
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_rows=rows;
_cols=cols;
int err=_device_alloc(*this,cq,rows,cols,_pitch,kind);
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
exit(1);
}
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Row major matrix on device
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param device Used to get the default command queue for operations
* \note - Coalesced access using adjacent cols on same row
* UCL_D_Mat(row,col) given by array[row*row_size()+col]
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_rows=rows;
_cols=cols;
int err=_device_alloc(*this,device,rows,cols,_pitch,kind);
_row_size=_pitch/sizeof(numtyp);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_row_size*cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate "
<< rows*cols*sizeof(numtyp) << " bytes on device.\n";
exit(1);
}
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
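// Illustrative sketch (names are hypothetical): allocate a rows x cols
// device matrix, zero it, and query the padded row stride.
//
//   UCL_Device gpu;
//   UCL_D_Mat<float> d_m;
//   if (d_m.alloc(rows,cols,gpu,UCL_READ_WRITE)!=UCL_SUCCESS)
//     { /* only reached when UCL_NO_EXIT is defined */ }
//   d_m.zero();
//   size_t stride=d_m.row_size();   // elements per row including padding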
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) {
clear();
_kind=UCL_VIEW;
_rows=rows;
_cols=cols;
_pitch=stride*sizeof(numtyp);
_row_size=stride;
this->_cq=input.cq();
#ifdef _OCL_MAT
_offset=0;
_array=input.cbegin();
#else
_device_view(&_array,input.begin());
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols)
{ view(input,rows,cols,input.row_size()); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view(ucl_type &input)
{ view(input,input.rows(),input.cols()); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev) {
clear();
_kind=UCL_VIEW;
_cols=cols;
_rows=rows;
_pitch=stride*sizeof(numtyp);
_row_size=stride;
this->_cq=dev.cq();
_array=input;
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride) {
clear();
_kind=UCL_VIEW;
_cols=cols;
_rows=rows;
_pitch=stride*sizeof(numtyp);
_row_size=stride;
this->_cq=input.cq();
#ifdef _OCL_MAT
_array=input.begin();
_offset=offset;
#else
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols)
{ view_offset(offset,input,rows,cols,input.row_size()); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input) {
if (input.rows()==1)
view_offset(offset,input,1,input.cols()-offset);
else
view_offset(offset,input,input.rows()-offset/input.row_size(),
input.cols());
}
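// Illustrative sketch (names are hypothetical): view the second row of an
// existing device matrix as a 1 x cols sub-matrix without copying.
//
//   UCL_D_Mat<float> whole;                    // previously allocated
//   UCL_D_Mat<float> row1;
//   row1.view_offset(whole.row_size(),whole,1,whole.cols());
//   // row1 shares storage with 'whole'; no memory is freed when row1 dies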
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols,const size_t stride,
UCL_Device &dev) {
clear();
_kind=UCL_VIEW;
_cols=cols;
_rows=rows;
_pitch=stride*sizeof(numtyp);
_row_size=stride;
this->_cq=dev.cq();
#ifdef _OCL_MAT
_array=input;
_offset=offset;
#else
#ifdef _UCL_DEVICE_PTR_MAT
_array=input+offset*sizeof(numtyp);
#else
_array=input+offset;
#endif
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,rows,cols,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,1,cols,dev); }
/// Free memory and set size to 0
inline void clear()
{ _rows=0; if (_kind!=UCL_VIEW) { _kind=UCL_VIEW; _device_free(*this); } }
/// Set each element to zero
inline void zero() { _device_zero(*this,row_bytes()*_rows); }
/// Set first n elements to zero
inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }
#ifdef _UCL_DEVICE_PTR_MAT
/// For OpenCL, returns a (void *) device pointer to memory allocation
inline device_ptr & begin() { return _array; }
/// For OpenCL, returns a (void *) device pointer to memory allocation
inline const device_ptr & begin() const { return _array; }
#else
/// For CUDA-RT, get device pointer to first element
inline numtyp * begin() { return _array; }
/// For CUDA-RT, get device pointer to first element
inline const numtyp * begin() const { return _array; }
/// For CUDA-RT, get device pointer to one past last element
inline numtyp * end() { return _end; }
/// For CUDA-RT, get device pointer to one past last element
inline const numtyp * end() const { return _end; }
#endif
#ifdef _UCL_DEVICE_PTR_MAT
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns void** **/
inline device_ptr & cbegin() { return _array; }
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns void** **/
inline const device_ptr & cbegin() const { return _array; }
#else
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns numtyp** **/
inline numtyp ** cbegin() { return &_array; }
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns numtyp** **/
inline const numtyp ** cbegin() const { return &_array; }
#endif
/// Get the number of elements
inline size_t numel() const { return _cols*_rows; }
/// Get the number of rows
inline size_t rows() const { return _rows; }
/// Get the number of columns
inline size_t cols() const { return _cols; }
/// Get the size of a row (including any padding) in elements
inline size_t row_size() const { return _row_size; }
/// Get the size of a row (including any padding) in bytes
inline size_t row_bytes() const { return _pitch; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
#ifdef _OCL_MAT
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return _offset; }
#else
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; }
#endif
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return offset()*sizeof(numtyp); }
private:
size_t _pitch, _row_size, _rows, _cols;
enum UCL_MEMOPT _kind;
#ifdef _UCL_DEVICE_PTR_MAT
device_ptr _array;
#else
numtyp *_array,*_end;
#endif
#ifdef _OCL_MAT
size_t _offset;
#endif
};
#endif

442
lib/gpu/geryon/ucl_d_vec.h Normal file
View File

@ -0,0 +1,442 @@
/***************************************************************************
ucl_d_vec.h
-------------------
W. Michael Brown
Vector Container on Device
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// Row vector on device
template <class numtyp>
class UCL_D_Vec : public UCL_BaseMat {
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum traits {
DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
MEM_TYPE = 0,
PADDED = 0,
ROW_MAJOR = 1,
VECTOR = 1
};
typedef numtyp data_type;
UCL_D_Vec() : _cols(0), _kind(UCL_VIEW) {}
~UCL_D_Vec() { if (_kind!=UCL_VIEW) _device_free(*this); }
/// Construct with n columns
/** \sa alloc() **/
UCL_D_Vec(const size_t n, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) :
_cols(0), _kind(UCL_VIEW) { alloc(n,device,kind); }
/// Set up device vector with 'cols' columns and reserve memory
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
int err=_device_alloc(*this,cq,_row_bytes,kind);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on device.\n";
exit(1);
}
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
/// Set up device vector with 'cols' columns and reserve memory
/** The kind parameter controls memory optimizations as follows:
* - UCL_READ_WRITE - Specify that you will read and write in kernels
* - UCL_WRITE_ONLY - Specify that you will only write in kernels
* - UCL_READ_ONLY - Specify that you will only read in kernels
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_READ_WRITE) {
clear();
_kind=kind;
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
int err=_device_alloc(*this,device,_row_bytes,kind);
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+cols;
#endif
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on device.\n";
exit(1);
}
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
return err;
}
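// Illustrative sketch (names are hypothetical): allocate a device vector
// and fill it from a pinned host vector using the routines in ucl_copy.h.
//
//   UCL_Device gpu;
//   UCL_H_Vec<float> h_v;
//   UCL_D_Vec<float> d_v;
//   h_v.alloc(n,gpu,UCL_WRITE_OPTIMIZED);
//   d_v.alloc(n,gpu,UCL_READ_ONLY);
//   ucl_copy(d_v,h_v,true);                    // async on d_v's command queue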
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=input.cq();
#ifdef _OCL_MAT
_offset=0;
_array=input.cbegin();
#else
_device_view(&_array,input.begin());
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) { view(input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view(ucl_type &input)
{ view(input,input.rows()*input.row_size()); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
UCL_Device &dev) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=dev.cq();
_array=input;
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
#ifdef _OCL_MAT
_offset=0;
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev)
{ view(input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view(ptr_type input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=input.cq();
#ifdef _OCL_MAT
_array=input.begin();
_offset=offset;
#else
_device_view(&_array,input.begin(),offset,sizeof(numtyp));
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride)
{ view_offset(offset,input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for the view **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input)
{ view_offset(offset,input,input.rows()*input.row_size()-offset); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols, UCL_Device &dev) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=dev.cq();
#ifdef _OCL_MAT
_array=input;
_offset=offset;
#else
#ifdef _UCL_DEVICE_PTR_MAT
_array=input+offset*sizeof(numtyp);
#else
_array=input+offset;
#endif
#endif
#ifndef _UCL_DEVICE_PTR_MAT
_end=_array+_cols;
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type input,const size_t rows,
const size_t cols,const size_t stride,UCL_Device &dev)
{ view_offset(offset,input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type input,
const size_t cols, UCL_Device &dev)
{ view_offset(offset,input,1,cols,dev); }
/// Free memory and set size to 0
inline void clear()
{ if (_kind!=UCL_VIEW) { _cols=0; _kind=UCL_VIEW; _device_free(*this); } }
/// Set each element to zero
inline void zero() { _device_zero(*this,row_bytes()); }
/// Set first n elements to zero
inline void zero(const int n) { _device_zero(*this,n*sizeof(numtyp)); }
#ifdef _UCL_DEVICE_PTR_MAT
/// For OpenCL, returns a (void *) device pointer to memory allocation
inline device_ptr & begin() { return _array; }
/// For OpenCL, returns a (void *) device pointer to memory allocation
inline const device_ptr & begin() const { return _array; }
#else
/// For CUDA-RT, get device pointer to first element
inline numtyp * begin() { return _array; }
/// For CUDA-RT, get device pointer to first element
inline const numtyp * begin() const { return _array; }
/// For CUDA-RT, get device pointer to one past last element
inline numtyp * end() { return _end; }
/// For CUDA-RT, get device pointer to one past last element
inline const numtyp * end() const { return _end; }
#endif
#ifdef _UCL_DEVICE_PTR_MAT
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns void** **/
inline device_ptr & cbegin() { return _array; }
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns void** **/
inline const device_ptr & cbegin() const { return _array; }
#else
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns numtyp** **/
inline numtyp ** cbegin() { return &_array; }
/// Returns an API specific device pointer
/** - For OpenCL, returns a &cl_mem object
* - For CUDA Driver, returns a &CUdeviceptr
* - For CUDA-RT, returns numtyp** **/
inline const numtyp ** cbegin() const { return &_array; }
/// For CUDA-RT, allocate row vector and bind texture
inline void safe_alloc(const size_t cols, UCL_Device &dev,
textureReference *t)
{ alloc(cols,dev); assign_texture(t); bind(); }
/// For CUDA-RT, assign a texture to matrix
inline void assign_texture(textureReference *t) { _tex_ptr=t; }
/// For CUDA-RT, bind to texture
inline void bind() {
cuda_gb_get_channel<numtyp>(_channel);
(*_tex_ptr).addressMode[0] = cudaAddressModeClamp;
(*_tex_ptr).addressMode[1] = cudaAddressModeClamp;
(*_tex_ptr).filterMode = cudaFilterModePoint;
(*_tex_ptr).normalized = false;
CUDA_SAFE_CALL(cudaBindTexture(NULL,_tex_ptr,_array,&_channel));
}
/// For CUDA-RT, unbind texture
inline void unbind() { CUDA_SAFE_CALL(cudaUnbindTexture(_tex_ptr)); }
#endif
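// Illustrative sketch for CUDA-RT builds only (names are hypothetical):
// allocate a device vector and bind it to a texture reference declared at
// file scope in a .cu file.
//
//   texture<float> q_tex;                  // file-scope texture reference
//   UCL_D_Vec<float> d_q;
//   d_q.safe_alloc(n,gpu,&q_tex);          // alloc + assign_texture + bind
//   // ... launch kernels that read d_q through q_tex ...
//   d_q.unbind();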
/// Get the number of elements
inline size_t numel() const { return _cols; }
/// Get the number of rows
inline size_t rows() const { return 1; }
/// Get the number of columns
inline size_t cols() const { return _cols; }
/// Get the size of a row (including any padding) in elements
inline size_t row_size() const { return _cols; }
/// Get the size of a row (including any padding) in bytes
inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
#ifdef _OCL_MAT
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return _offset; }
#else
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; }
#endif
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return offset()*sizeof(numtyp); }
private:
size_t _row_bytes, _row_size, _rows, _cols;
enum UCL_MEMOPT _kind;
#ifdef _UCL_DEVICE_PTR_MAT
device_ptr _array;
#else
numtyp *_array,*_end;
cudaChannelFormatDesc _channel;
textureReference *_tex_ptr;
#endif
#ifdef _OCL_MAT
size_t _offset;
#endif
};
#endif

View File

@ -0,0 +1,48 @@
/***************************************************************************
nvc_get_devices.h
-------------------
W. Michael Brown
List properties of cuda devices
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Wed Jan 28 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifdef UCL_OPENCL
#include "ocl_device.h"
using namespace ucl_opencl;
#endif
#ifdef UCL_CUDADR
#include "nvd_device.h"
using namespace ucl_cudadr;
#endif
#ifdef UCL_CUDART
#include "nvc_device.h"
using namespace ucl_cudart;
#endif
int main(int argc, char** argv) {
UCL_Device cop;
std::cout << "Found " << cop.num_platforms() << " platform(s).\n";
if (cop.num_platforms()>0) {
std::cout << "Using platform: " << cop.platform_name() << std::endl;
cop.print_all(std::cout);
}
return 0;
}

378
lib/gpu/geryon/ucl_h_mat.h Normal file
View File

@ -0,0 +1,378 @@
/***************************************************************************
ucl_h_mat.h
-------------------
W. Michael Brown
Matrix Container on Host
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// Matrix on Host with options for pinning (page locked)
template <class numtyp>
class UCL_H_Mat : public UCL_BaseMat {
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum traits {
DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
MEM_TYPE = 1,
PADDED = 0,
ROW_MAJOR = 1,
VECTOR = 0
};
typedef numtyp data_type;
UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { }
~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
/// Construct with specified number of rows and columns
/** \sa alloc() **/
UCL_H_Mat(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
{ _rows=0; _kind=UCL_VIEW; alloc(rows,cols,device,kind); }
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t rows, const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_rows=rows;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,cq,_row_bytes*_rows,kind);
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
<< " bytes on host.\n";
exit(1);
}
#endif
_end=_array+rows*cols;
return err;
}
/// Set up host matrix with specified # of rows/cols and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t rows, const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_rows=rows;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,device,_row_bytes*_rows,kind);
_end=_array+rows*cols;
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows
<< " bytes on host.\n";
exit(1);
}
#endif
return err;
}
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) {
assert(rows==1 || stride==cols);
clear();
_kind=UCL_VIEW;
_cols=cols;
_rows=rows;
_row_bytes=stride*sizeof(numtyp);
this->_cq=input.cq();
_array=input.begin();
_end=_array+_cols;
#ifdef _OCL_MAT
_carray=input.cbegin();
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols)
{ view(input,rows,cols,input.row_size()); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input)
{ view(input,input.rows(),input.cols()); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev) {
assert(rows==1 || stride==cols);
clear();
_kind=UCL_VIEW;
_cols=cols;
_rows=rows;
_row_bytes=stride*sizeof(numtyp);
this->_cq=dev.cq();
_array=input;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,dev,_row_bytes,UCL_VIEW);
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
UCL_Device &dev) { view(input,rows,cols,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride) {
assert(rows==1 || stride==cols);
clear();
_kind=UCL_VIEW;
_cols=cols;
_rows=rows;
_row_bytes=stride*sizeof(numtyp);
this->_cq=input.cq();
_array=input.begin()+offset;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,input,_row_bytes,UCL_VIEW);
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols)
{ view_offset(offset,input,rows,cols,input.row_size()); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input) {
if (input.rows()==1)
view_offset(offset,input,1,input.cols()-offset);
else
view_offset(offset,input,input.rows()-offset/input.row_size(),
input.cols());
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols, UCL_Device &dev)
{ view(input+offset,rows,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols,const size_t stride,UCL_Device &dev)
{ view(input+offset,rows,cols,stride,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type *input,
const size_t cols, UCL_Device &dev)
{ view(input+offset,1,cols,dev); }
/// Free memory and set size to 0
inline void clear()
{ if (_kind!=UCL_VIEW) {_rows=0; _kind=UCL_VIEW; _host_free(*this,_kind); }}
/// Set each element to zero
inline void zero() { _host_zero(_array,_rows*row_bytes()); }
/// Set first n elements to zero
inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); }
/// Get host pointer to first element
inline numtyp * begin() { return _array; }
/// Get host pointer to first element
inline const numtyp * begin() const { return _array; }
/// Get host pointer to one past last element
inline numtyp * end() { return _end; }
/// Get host pointer to one past last element
inline const numtyp * end() const { return _end; }
/// Get the number of elements
inline size_t numel() const { return _rows*_cols; }
/// Get the number of rows
inline size_t rows() const { return _rows; }
/// Get the number of columns
inline size_t cols() const { return _cols; }
/// Get the size of a row (including any padding) in elements
inline size_t row_size() const { return _cols; }
/// Get the size of a row (including any padding) in bytes
inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
/// Get element at index i
inline numtyp & operator[](const int i) { return _array[i]; }
/// Get element at index i
inline const numtyp & operator[](const int i) const { return _array[i]; }
/// 2D access (row should always be 0)
inline numtyp & operator()(const int row, const int col)
{ return _array[row*_cols+col]; }
/// 2D access (row should always be 0)
inline const numtyp & operator()(const int row, const int col) const
{ return _array[row*_cols+col]; }
/// Returns pointer to memory pointer for allocation on host
inline numtyp ** host_ptr() { return &_array; }
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; }
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return 0; }
#ifdef _OCL_MAT
/// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
inline device_ptr & cbegin() { return _carray; }
/// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
inline const device_ptr & cbegin() const { return _carray; }
#else
/// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
inline void ** cbegin() { return (void **)&_array; }
/// Returns an API specific device pointer (cl_mem& for OpenCL, void ** for CUDA)
inline const void ** cbegin() const { return (const void **)&_array; }
#endif
private:
enum UCL_MEMOPT _kind;
numtyp *_array, *_end;
size_t _row_bytes, _rows, _cols;
#ifdef _OCL_MAT
device_ptr _carray;
#endif
};
#endif
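A minimal host-side usage sketch for this matrix container. It assumes the CUDA-RT flavor of Geryon (nvc_device.h and nvc_memory.h on the include path, namespace ucl_cudart); the OpenCL and CUDA-driver flavors expose the same calls.

#include <iostream>
#include "nvc_device.h"   // assumed CUDA-RT back-end headers
#include "nvc_memory.h"
using namespace ucl_cudart;

int main() {
  UCL_Device dev;                                    // default device/platform

  UCL_H_Mat<double> m(4, 8, dev, UCL_RW_OPTIMIZED);  // 4x8 pinned host matrix
  m.zero();                                          // clear all 32 elements
  m(2, 5) = 3.14;                                    // row-major 2D access

  UCL_H_Mat<double> sub;                             // zero-copy view of rows 2-3
  sub.view_offset(2 * m.row_size(), m, 2, m.cols());
  std::cout << sub(0, 5) << std::endl;               // prints 3.14

  return 0;
}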

370
lib/gpu/geryon/ucl_h_vec.h Normal file
View File

@ -0,0 +1,370 @@
/***************************************************************************
ucl_h_vec.h
-------------------
W. Michael Brown
Vector Container on Host
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Thu Jun 25 2009
copyright : (C) 2009 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2009) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifdef _UCL_MAT_ALLOW
/// Row Vector on Host with options for pinning (page locked)
template <class numtyp>
class UCL_H_Vec : public UCL_BaseMat {
public:
// Traits for copying data
// MEM_TYPE is 0 for device, 1 for host, and 2 for image
enum traits {
DATA_TYPE = _UCL_DATA_ID<numtyp>::id,
MEM_TYPE = 1,
PADDED = 0,
ROW_MAJOR = 1,
VECTOR = 1
};
typedef numtyp data_type;
UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { }
~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); }
/// Construct with n columns
/** \sa alloc() **/
UCL_H_Vec(const size_t n, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED)
{ _cols=0; _kind=UCL_VIEW; alloc(n,device,kind); }
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* \param cq Default command queue for operations copied from another mat
* \return UCL_SUCCESS if the memory allocation is successful **/
template <class mat_type>
inline int alloc(const size_t cols, mat_type &cq,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,cq,_row_bytes,kind);
_end=_array+cols;
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on host.\n";
exit(1);
}
#endif
return err;
}
/// Set up host vector with 'cols' columns and reserve memory
/** The kind parameter controls memory pinning as follows:
* - UCL_NOT_PINNED - Memory is not pinned
* - UCL_WRITE_OPTIMIZED - Memory can be pinned (write-combined)
* - UCL_RW_OPTIMIZED - Memory can be pinned
* \param device Used to get the default command queue for operations
* \return UCL_SUCCESS if the memory allocation is successful **/
inline int alloc(const size_t cols, UCL_Device &device,
const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) {
clear();
_cols=cols;
_row_bytes=cols*sizeof(numtyp);
_kind=kind;
int err=_host_alloc(*this,device,_row_bytes,kind);
_end=_array+cols;
#ifndef UCL_NO_EXIT
if (err!=UCL_SUCCESS) {
std::cerr << "UCL Error: Could not allocate " << _row_bytes
<< " bytes on host.\n";
exit(1);
}
#endif
return err;
}
/// Return the type of memory allocation
/** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, or UCL_VIEW **/
inline enum UCL_MEMOPT kind() const { return _kind; }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=input.cq();
_array=input.begin();
_end=_array+_cols;
#ifdef _OCL_MAT
_carray=input.cbegin();
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t rows, const size_t cols,
const size_t stride) { view(input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input, const size_t cols)
{ view(input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view(ucl_type &input)
{ view(input,input.rows()*input.row_size()); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
UCL_Device &dev) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=dev.cq();
_array=input;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,dev,_row_bytes,UCL_VIEW);
#endif
}
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t rows, const size_t cols,
const size_t stride, UCL_Device &dev)
{ view(input,rows,cols,stride); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view(ptr_type *input, const size_t cols, UCL_Device &dev)
{ view(input,1,cols,dev); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols) {
#ifdef UCL_DEBUG
assert(rows==1);
#endif
clear();
_kind=UCL_VIEW;
_cols=cols;
_row_bytes=_cols*sizeof(numtyp);
this->_cq=input.cq();
_array=input.begin()+offset;
_end=_array+_cols;
#ifdef _OCL_MAT
_host_alloc(*this,input,_row_bytes,UCL_VIEW);
#endif
}
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device container on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t rows,
const size_t cols, const size_t stride)
{ view_offset(offset,input,rows,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset,ucl_type &input,const size_t cols)
{ view_offset(offset,input,1,cols); }
/// Do not allocate memory, instead use an existing allocation from Geryon
/** This function must be passed a Geryon vector or matrix container.
* No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - If a matrix is used as input, all elements (including padding)
* will be used for view
* - Viewing a device container on the host is not supported **/
template <class ucl_type>
inline void view_offset(const size_t offset, ucl_type &input)
{ view_offset(offset,input,input.rows()*input.row_size()-offset); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols, UCL_Device &dev)
{ view(input+offset,rows,cols,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported
* \param stride Number of _elements_ between the start of each row **/
template <class ptr_type>
inline void view_offset(const size_t offset,ptr_type *input,const size_t rows,
const size_t cols,const size_t stride,UCL_Device &dev)
{ view(input+offset,rows,cols,stride,dev); }
/// Do not allocate memory, instead use an existing allocation
/** - No memory is freed when the object is destructed.
* - The view does not prevent the memory from being freed by the
* allocating container
* - Viewing a device pointer on the host is not supported **/
template <class ptr_type>
inline void view_offset(const size_t offset, ptr_type *input,
const size_t cols, UCL_Device &dev)
{ view(input+offset,1,cols,dev); }
/// Free memory and set size to 0
inline void clear()
{ if (_kind!=UCL_VIEW) {_kind=UCL_VIEW; _cols=0; _host_free(*this,_kind);}}
/// Set each element to zero
inline void zero() { _host_zero(_array,row_bytes()); }
/// Set first n elements to zero
inline void zero(const int n) { _host_zero(_array,n*sizeof(numtyp)); }
/// Get host pointer to first element
inline numtyp * begin() { return _array; }
/// Get host pointer to first element
inline const numtyp * begin() const { return _array; }
/// Get host pointer to one past last element
inline numtyp * end() { return _end; }
/// Get host pointer to one past last element
inline const numtyp * end() const { return _end; }
/// Get the number of elements
inline size_t numel() const { return _cols; }
/// Get the number of rows
inline size_t rows() const { return 1; }
/// Get the number of columns
inline size_t cols() const { return _cols; }
/// Get the size of a row (including any padding) in elements
inline size_t row_size() const { return _cols; }
/// Get the size of a row (including any padding) in bytes
inline size_t row_bytes() const { return _row_bytes; }
/// Get the size in bytes of 1 element
inline int element_size() const { return sizeof(numtyp); }
/// Get element at index i
inline numtyp & operator[](const int i) { return _array[i]; }
/// Get element at index i
inline const numtyp & operator[](const int i) const { return _array[i]; }
/// 2D access (row should always be 0)
inline numtyp & operator()(const int row, const int col)
{ return _array[col]; }
/// 2D access (row should always be 0)
inline const numtyp & operator()(const int row, const int col) const
{ return _array[col]; }
/// Returns pointer to memory pointer for allocation on host
inline numtyp ** host_ptr() { return &_array; }
/// Return the offset (in elements) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t offset() const { return 0; }
/// Return the offset (in bytes) from begin() pointer where data starts
/** \note Always 0 for host matrices and CUDA APIs **/
inline size_t byteoff() const { return 0; }
#ifdef _OCL_MAT
/// For OpenCL, returns a reference to the cl_mem object
inline device_ptr & cbegin() { return _carray; }
/// For OpenCL, returns a reference to the cl_mem object
inline const device_ptr & cbegin() const { return _carray; }
#endif
private:
enum UCL_MEMOPT _kind;
numtyp *_array, *_end;
size_t _row_bytes, _cols;
#ifdef _OCL_MAT
device_ptr _carray;
#endif
};
#endif
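A companion sketch for the vector container, under the same assumptions as the matrix example above (CUDA-RT flavor, Geryon headers on the include path).

#include <iostream>
#include "nvc_device.h"
#include "nvc_memory.h"
using namespace ucl_cudart;

int main() {
  UCL_Device dev;

  // Pinned (page-locked) buffer of 256 floats for fast host<->device copies
  UCL_H_Vec<float> buf(256, dev, UCL_RW_OPTIMIZED);
  buf.zero();
  for (int i = 0; i < 256; i++) buf[i] = static_cast<float>(i);

  // Wrap an existing C array without allocating or taking ownership
  double raw[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  UCL_H_Vec<double> wrap;
  wrap.view(raw, 8, dev);
  wrap.zero(4);                          // zeroes raw[0..3], leaves raw[4..7] alone

  std::cout << wrap[5] << std::endl;     // prints 5
  return 0;
}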

View File

@ -0,0 +1,42 @@
/***************************************************************************
ucl_nv_kernel.h
-------------------
W. Michael Brown
Preprocessor macros for OpenCL/CUDA compatibility
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Mon May 3 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by CUDA and OpenCL specific headers
#ifndef UCL_NV_KERNEL_H
#define UCL_NV_KERNEL_H
#define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x)
#define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y)
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_SIZE_X blockDim.x
#define BLOCK_SIZE_Y blockDim.y
#define __kernel extern "C" __global__
#define __local __shared__
#define mul24 __mul24
#define __global
#define __inline static __inline__ __device__
#endif
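These defines let one kernel source compile either as CUDA (when NV_KERNEL is defined and this header is included) or as OpenCL C (where __kernel, __global and __local are keywords). A small illustrative kernel written against that convention follows; the OpenCL fallback for GLOBAL_ID_X mirrors what the lj96/cut kernel further below does, and the kernel name scale_vec is made up for the example.

#ifdef NV_KERNEL
#include "ucl_nv_kernel.h"
#else
#define GLOBAL_ID_X get_global_id(0)
#endif

// Scales the first n elements of x by a; one thread/work-item per element.
__kernel void scale_vec(__global float *x, const int n, const float a) {
  int i = GLOBAL_ID_X;
  if (i < n)
    x[i] *= a;
}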

273
lib/gpu/geryon/ucl_print.h Normal file
View File

@ -0,0 +1,273 @@
/***************************************************************************
ucl_print.h
-------------------
W. Michael Brown
Routines for printing debugging output for matrix/vector data
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Mon Jan 11 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
// Only allow this file to be included by nvc_memory.h and ocl_memory.h
#ifdef UCL_PRINT_ALLOW
template <int mem> struct _ucl_print;
template <> struct _ucl_print<1> {
template <class mat_type>
static inline void p(mat_type &mat, const size_t n, std::ostream &out,
const std::string delim) {
for (size_t i=0; i<n-1; i++)
out << mat[i] << delim;
out << mat[n-1];
}
template <class mat_type>
static inline void p(const mat_type &mat, const size_t n, std::ostream &out,
const std::string delim, UCL_Device &dev) {
p(mat,n,out,delim);
}
template <class mat_type>
static inline void p(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out, const std::string delim,
const std::string row_delim) {
int offset=0;
int row_size=cols;
if (mat_type::VECTOR==0)
row_size=mat.row_size();
for (size_t j=0; j<rows; j++) {
size_t lend=offset+cols-1;
for (size_t i=offset; i<lend; i++)
out << mat[i] << delim;
out << mat[lend];
if (j!=rows-1)
out << row_delim;
offset+=row_size;
}
}
template <class mat_type>
static inline void p(const mat_type &mat,const size_t rows,const size_t cols,
std::ostream &out,const std::string delim,
const std::string row_delim, UCL_Device &dev) {
p(mat,rows,cols,out,delim,row_delim);
}
};
template <int mem> struct _ucl_print {
template <class mat_type>
static inline void p(mat_type &mat, const size_t n, std::ostream &out,
const std::string delim) {
UCL_H_Vec<typename mat_type::data_type> temp;
temp.alloc(n,mat);
ucl_copy(temp,mat,n,false);
_ucl_print<1>::p(temp,n,out,delim);
}
template <class mat_type>
static inline void p(const mat_type &mat, const size_t n, std::ostream &out,
const std::string delim, UCL_Device &dev) {
UCL_H_Vec<typename mat_type::data_type> temp;
temp.alloc(n,dev);
ucl_copy(temp,mat,n,false);
_ucl_print<1>::p(temp,n,out,delim);
}
template <class mat_type>
static inline void p(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out, const std::string delim,
const std::string row_delim) {
UCL_H_Vec<typename mat_type::data_type> temp;
temp.alloc(mat.rows()*mat.cols(),mat);
if (mat_type::VECTOR==1)
ucl_copy(temp,mat,rows*cols,false);
else
ucl_copy(temp,mat,rows,cols,false);
_ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
}
template <class mat_type>
static inline void p(const mat_type &mat, const size_t rows,
const size_t cols,std::ostream &out,
const std::string delim,
const std::string row_delim, UCL_Device &dev) {
UCL_H_Vec<typename mat_type::data_type> temp;
temp.alloc(mat.rows()*mat.cols(),dev);
if (mat_type::VECTOR==1)
ucl_copy(temp,mat,rows*cols,false);
else
ucl_copy(temp,mat,rows,cols,false);
_ucl_print<1>::p(temp,rows,cols,out,delim,row_delim);
}
};
// -------------------------------------------------------------------------
// - Non-const routines that do not require a device object
// -------------------------------------------------------------------------
/// Outputs n elements of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out,
const std::string delim) {
if (n>mat.numel()) {
std::cerr << "Attempted to ucl_print " << n << " elements of matrix "
<< "that only has " << mat.numel() << " elements.";
exit(1);
}
_ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim);
}
/// Outputs n elements of mat delimited by a space
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n, std::ostream &out) {
ucl_print(mat,n,out," ");
}
/// Outputs n elements of mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t n) {
ucl_print(mat,n,std::cout," ");
}
/// Outputs upper left rows and cols of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out, const std::string delim,
const std::string row_delim) {
if (rows*cols>mat.numel()) {
std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix "
<< "that only has " << mat.numel() << " elements.";
exit(1);
}
_ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim);
}
/// Outputs upper left rows and cols of mat delimited by a space
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows, const size_t cols,
std::ostream &out) {
ucl_print(mat,rows,cols,out," ","\n");
}
/// Outputs upper left rows and cols of mat delimited by a space to std out
template <class mat_type>
inline void ucl_print(mat_type &mat, const size_t rows,
const size_t cols) {
ucl_print(mat,rows,cols,std::cout," ","\n");
}
/// Outputs mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(mat_type &mat) {
ucl_print(mat,std::cout);
}
/// Outputs mat delimited by a space
template <class mat_type>
inline void ucl_print(mat_type &mat, std::ostream &out) {
if (mat_type::VECTOR==1)
ucl_print(mat,mat.cols(),out," ");
else
ucl_print(mat,mat.rows(),mat.cols(),out," ","\n");
}
// -------------------------------------------------------------------------
// - Const routines that do not require a device object
// -------------------------------------------------------------------------
/// Outputs n elements of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out,
const std::string delim, UCL_Device &dev) {
if (n>mat.numel()) {
std::cerr << "Attempted to ucl_print " << n << " elements of matrix "
<< "that only has " << mat.numel() << " elements.";
exit(1);
}
_ucl_print<mat_type::MEM_TYPE>::p(mat,n,out,delim,dev);
}
/// Outputs n elements of mat delimited by a space
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n, std::ostream &out,
UCL_Device &dev) {
ucl_print(mat,n,out," ",dev);
}
/// Outputs n elements of mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t n,
UCL_Device &dev) {
ucl_print(mat,n,std::cout," ",dev);
}
/// Outputs upper left rows and cols of mat delimited by the string delim
template <class mat_type>
inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
std::ostream &out, const std::string delim,
const std::string row_delim, UCL_Device &dev) {
if (rows*cols>mat.numel()) {
std::cerr << "Attempted to ucl_print " << rows*cols << " elements of matrix "
<< "that only has " << mat.numel() << " elements.";
exit(1);
}
_ucl_print<mat_type::MEM_TYPE>::p(mat,rows,cols,out,delim,row_delim,dev);
}
/// Outputs upper left rows and cols of mat delimited by a space
template <class mat_type>
inline void ucl_print(const mat_type &mat,const size_t rows,const size_t cols,
std::ostream &out, UCL_Device &dev) {
ucl_print(mat,rows,cols,out," ","\n",dev);
}
/// Outputs upper left rows and cols of mat delimited by a space to std out
template <class mat_type>
inline void ucl_print(const mat_type &mat, const size_t rows,
const size_t cols, UCL_Device &dev) {
ucl_print(mat,rows,cols,std::cout," ","\n",dev);
}
/// Outputs mat delimited by a space to standard out
template <class mat_type>
inline void ucl_print(const mat_type &mat, UCL_Device &dev) {
ucl_print(mat,std::cout,dev);
}
/// Outputs mat delimited by a space
template <class mat_type>
inline void ucl_print(const mat_type &mat, std::ostream &out, UCL_Device &dev) {
if (mat_type::VECTOR==1)
ucl_print(mat,mat.cols(),out," ",dev);
else
ucl_print(mat,mat.rows(),mat.cols(),out," ","\n",dev);
}
// -------------------------------------------------------------------------
// - Operator << Overloading
// -------------------------------------------------------------------------
template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_H_Vec<numtyp> &mat)
{ ucl_print(mat,out); return out; }
template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_H_Mat<numtyp> &mat)
{ ucl_print(mat,out); return out; }
template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Vec<numtyp> &mat)
{ ucl_print(mat,out); return out; }
template <class numtyp>
inline std::ostream & operator << (std::ostream &out, UCL_D_Mat<numtyp> &mat)
{ ucl_print(mat,out); return out; }
#endif
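A short sketch of these printing helpers, again assuming the CUDA-RT flavor and the header names used in the device-listing program above.

#include <iostream>
#include "nvc_device.h"
#include "nvc_memory.h"
using namespace ucl_cudart;

int main() {
  UCL_Device dev;

  UCL_H_Mat<int> m(2, 3, dev, UCL_NOT_PINNED);
  for (int i = 0; i < 6; i++) m[i] = i;

  ucl_print(m, 2, 3, std::cout, ",", ";");   // prints 0,1,2;3,4,5
  std::cout << "\n" << m << std::endl;       // space/newline delimited form

  // Device containers work too: the printer stages them through a temporary
  // UCL_H_Vec and a ucl_copy before formatting.
  return 0;
}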

121
lib/gpu/geryon/ucl_types.h Normal file
View File

@ -0,0 +1,121 @@
/***************************************************************************
ucl_types.h
-------------------
W. Michael Brown
Data type definitions for Coprocessor library
__________________________________________________________________________
This file is part of the Geryon Unified Coprocessor Library (UCL)
__________________________________________________________________________
begin : Mon Jan 4 2010
copyright : (C) 2010 by W. Michael Brown
email : brownw@ornl.gov
***************************************************************************/
/* -----------------------------------------------------------------------
Copyright (2010) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the Simplified BSD License.
----------------------------------------------------------------------- */
#ifndef UCL_TYPES_H
#define UCL_TYPES_H
// Assign an integer id based on the data type: (int, float, double, etc)
template <class eltype> struct _UCL_DATA_ID;
template <> struct _UCL_DATA_ID<double> {
enum { id=1 };
static inline const char * name() { return "double"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=double"; }
};
template <> struct _UCL_DATA_ID<float> {
enum { id=2 };
static inline const char * name() { return "float"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=float"; }
};
template <> struct _UCL_DATA_ID<unsigned> {
enum { id=3 };
static inline const char * name() { return "unsigned"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned"; }
};
template <> struct _UCL_DATA_ID<int> {
enum { id=4 };
static inline const char * name() { return "int"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=int"; }
};
template <> struct _UCL_DATA_ID<char> {
enum { id=5 };
static inline const char * name() { return "char"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=char"; }
};
template <> struct _UCL_DATA_ID<unsigned char> {
enum { id=6 };
static inline const char * name() { return "unsigned char"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned char"; }
};
template <> struct _UCL_DATA_ID<short> {
enum { id=7 };
static inline const char * name() { return "short"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=short"; }
};
template <> struct _UCL_DATA_ID<unsigned short> {
enum { id=8 };
static inline const char * name() { return "unsigned short"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned short"; }
};
template <> struct _UCL_DATA_ID<long> {
enum { id=9 };
static inline const char * name() { return "long"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=long"; }
};
template <> struct _UCL_DATA_ID<unsigned long> {
enum { id=10 };
static inline const char * name() { return "unsigned long"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=unsigned long"; }
};
template <> struct _UCL_DATA_ID<long double> {
enum { id=11 };
static inline const char * name() { return "long double"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=long double"; }
};
template <class eltype> struct _UCL_DATA_ID {
enum { id=0 };
static inline const char * name() { return "error_type"; }
static inline const char * numtyp_flag() { return "-D NUMTYP=error_type"; }
};
// Host memory allocation types
enum UCL_MEMOPT {
UCL_WRITE_ONLY, ///< Allow any optimizations for memory that is write only
UCL_READ_ONLY, ///< Allow any optimizations for memory that is read only
UCL_READ_WRITE, ///< Allow read and write
UCL_WRITE_OPTIMIZED,///< Allow host memory to be pinned (write combined)
UCL_RW_OPTIMIZED, ///< Allow host memory to be pinned
UCL_NOT_PINNED, ///< Host memory is not to be pinned
UCL_VIEW ///< View of another memory allocation
};
enum UCL_DEVICE_TYPE {
UCL_DEFAULT, ///< Unknown device type
UCL_CPU, ///< Device is a CPU
UCL_GPU, ///< Device is a GPU
UCL_ACCELERATOR ///< Device is an Accelerator
};
enum UCL_ERROR_FLAG {
UCL_SUCCESS, ///< No error
UCL_ERROR, ///< Unqualified error
UCL_FILE_NOT_FOUND, ///< File not found
UCL_FUNCTION_NOT_FOUND, ///< Kernel function not found
UCL_COMPILE_ERROR, ///< Error compiling kernel
UCL_MEMORY_ERROR
};
template <class numtyp>
const char * ucl_template_name() { return _UCL_DATA_ID<numtyp>::name(); }
#endif
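A tiny sketch exercising the type traits above; it only needs this header (path assumed to be on the include path), and my_struct is a hypothetical unspecialized type.

#include <iostream>
#include "ucl_types.h"

struct my_struct {};   // hypothetical type with no _UCL_DATA_ID specialization

int main() {
  // Per-type integer ids and printable names
  std::cout << ucl_template_name<double>() << " id="
            << _UCL_DATA_ID<double>::id << "\n";           // double id=1

  // Compiler flag used when building kernels for a given precision
  std::cout << _UCL_DATA_ID<float>::numtyp_flag() << "\n"; // -D NUMTYP=float

  // Unspecialized types fall through to the error id
  std::cout << _UCL_DATA_ID<my_struct>::id << std::endl;   // 0
  return 0;
}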

123
lib/gpu/lj96_cut_gpu.cpp Normal file
View File

@ -0,0 +1,123 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "lj96_cut_gpu_memory.h"
using namespace std;
static LJ96_GPU_Memory<PRECISION,ACC_PRECISION> LJ96MF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen) {
LJ96MF.clear();
gpu_mode=LJ96MF.device->gpu_mode();
double gpu_split=LJ96MF.device->particle_split();
int first_gpu=LJ96MF.device->first_device();
int last_gpu=LJ96MF.device->last_device();
int world_me=LJ96MF.device->world_me();
int gpu_rank=LJ96MF.device->gpu_rank();
int procs_per_gpu=LJ96MF.device->procs_per_gpu();
LJ96MF.device->init_message(screen,"lj96/cut",first_gpu,last_gpu);
bool message=false;
if (world_me==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
MPI_Barrier(MPI_COMM_WORLD);
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum,
nall, 300, maxspecial, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
MPI_Barrier(LJ96MF.device->gpu_comm);
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
}
void lj96_gpu_clear() {
LJ96MF.clear();
}
int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
return LJ96MF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success);
}
void lj96_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success) {
LJ96MF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double lj96_gpu_bytes() {
return LJ96MF.host_memory_usage();
}
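For orientation, a hedged sketch of how a caller would drive this interface. The forward declarations are repeated because the matching pair-style header is not part of this excerpt, MPI is assumed to have been initialized by the host code (lj96_gpu_init synchronizes with MPI_Barrier), and every argument value shown is illustrative.

#include <cstdio>

// Declarations matching the definitions above (normally provided by the
// pair style that wraps this library).
bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                   double **host_lj2, double **host_lj3, double **host_lj4,
                   double **offset, double *special_lj, const int inum,
                   const int nall, const int max_nbors, const int maxspecial,
                   const double cell_size, int &gpu_mode, FILE *screen);
void lj96_gpu_clear();
double lj96_gpu_bytes();

// Called once after the pair coefficients are known.
bool setup_lj96_gpu(int ntypes, double **cutsq, double **lj1, double **lj2,
                    double **lj3, double **lj4, double **offset,
                    double *special_lj, int nlocal, int nall) {
  int gpu_mode;
  const int max_nbors = 300;          // initial neighbor rows, as used above
  const int maxspecial = 0;           // illustrative
  const double cell_size = 2.5 + 0.3; // cutoff + skin, illustrative

  if (!lj96_gpu_init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, special_lj,
                     nlocal, nall, max_nbors, maxspecial, cell_size,
                     gpu_mode, stdout))
    return false;

  // ... each timestep: lj96_gpu_compute_n() (device-built neighbor lists) or
  //     lj96_gpu_compute() (host-built neighbor lists) ...

  std::printf("lj96/cut GPU host memory: %g bytes\n", lj96_gpu_bytes());
  lj96_gpu_clear();                   // release device and host buffers
  return true;
}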

View File

@ -0,0 +1,281 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJ96_GPU_KERNEL
#define LJ96_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=GLOBAL_ID_X;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp r3inv = sqrt(r6inv);
numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the two interacting particles in gi
int ii=THREAD_ID_X;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (ii<4)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (eflag>0)
lj3[ii]=lj3_in[ii];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp r3inv = sqrt(r6inv);
numtyp force = r2inv*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif
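Both kernels recover the special-bond scaling from the packed neighbor index: a bonded neighbor j is stored as j + n*nall with n = 1, 2, 3 (in LAMMPS convention, the 1-2, 1-3 and 1-4 weights), so sp_lj[j/nall] gives the weight and j %= nall restores the atom index. A tiny host-side check of that arithmetic, with illustrative values:

#include <cassert>

int main() {
  const int nall = 1000;                        // local + ghost atom count
  const double sp_lj[4] = {1.0, 0.0, 0.0, 0.5}; // full, 1-2, 1-3, 1-4 weights

  int j = 42 + 3 * nall;                        // atom 42 packed as a 1-4 neighbor
  double factor_lj = (j < nall) ? 1.0 : sp_lj[j / nall];
  j %= nall;

  assert(j == 42);
  assert(factor_lj == 0.5);
  return 0;
}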

View File

@ -0,0 +1,150 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "lj96_cut_gpu_cl.h"
#else
#include "lj96_cut_gpu_ptx.h"
#endif
#include "lj96_cut_gpu_memory.h"
#include <cassert>
#define LJ96_GPU_MemoryT LJ96_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJ96_GPU_MemoryT::LJ96_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJ96_GPU_MemoryT::~LJ96_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
bool LJ96_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj96_cut_gpu_kernel);
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
}
template <class numtyp, class acctyp>
void LJ96_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJ96_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJ96_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int ainum=this->atom->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
}
this->time_pair.stop();
}
template class LJ96_GPU_Memory<PRECISION,ACC_PRECISION>;
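The grid sizing in loop() above launches one thread per local atom: GX is just the number of BX-wide blocks needed to cover inum atoms, with the excess threads masked off by the kernel's if (ii<inum) test. A minimal, hedged sketch of that arithmetic with illustrative numbers (not values taken from this commit):
#include <math.h>
#include <stdio.h>
int main() {
  int inum = 10000;  // local atoms on this process (assumed example value)
  int BX   = 64;     // threads per block, i.e. this->block_size() (assumed)
  int GX   = static_cast<int>(ceil(static_cast<double>(inum)/BX));
  printf("launch %d blocks of %d threads for %d atoms\n", GX, BX, inum);
  return 0;          // prints: launch 157 blocks of 64 threads for 10000 atoms
}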

View File

@ -0,0 +1,71 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJ96_GPU_MEMORY_H
#define LJ96_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class LJ96_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
LJ96_GPU_Memory();
~LJ96_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif
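For orientation, a hedged sketch of how a host-side pair style might drive this class; the wrapper name and all argument names are placeholders, and only the members declared above are used.
// Sketch under assumptions: the coefficient tables and sizes are owned by the
// calling pair style; clear() is also invoked by the destructor.
template <class numtyp, class acctyp>
static bool lj96_setup_sketch(LJ96_GPU_Memory<numtyp,acctyp> &mem,
                              const int ntypes, double **cutsq, double **lj1,
                              double **lj2, double **lj3, double **lj4,
                              double **offset, double *special_lj,
                              const int nlocal, const int nall,
                              const int max_nbors, const int maxspecial,
                              const double cell_size, const double gpu_split,
                              FILE *screen) {
  return mem.init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, special_lj,
                  nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split,
                  screen);
}
Per-timestep force evaluation then goes through the AtomicGPUMemory base, which presumably invokes the private loop() declared above, in the same way the lj_cut_gpu.cpp driver shown later in this diff drives its memory class.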

124
lib/gpu/lj_cut_gpu.cpp Normal file
View File

@ -0,0 +1,124 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "lj_cut_gpu_memory.h"
using namespace std;
static LJL_GPU_Memory<PRECISION,ACC_PRECISION> LJLMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljl_gpu_init(const int ntypes, double **cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **offset, double *special_lj,
const int inum, const int nall, const int max_nbors,
const int maxspecial, const double cell_size, int &gpu_mode,
FILE *screen) {
LJLMF.clear();
gpu_mode=LJLMF.device->gpu_mode();
double gpu_split=LJLMF.device->particle_split();
int first_gpu=LJLMF.device->first_device();
int last_gpu=LJLMF.device->last_device();
int world_me=LJLMF.device->world_me();
int gpu_rank=LJLMF.device->gpu_rank();
int procs_per_gpu=LJLMF.device->procs_per_gpu();
LJLMF.device->init_message(screen,"lj/cut",first_gpu,last_gpu);
bool message=false;
if (world_me==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen);
if (!init_ok)
return false;
}
MPI_Barrier(MPI_COMM_WORLD);
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen);
if (!init_ok)
return false;
}
MPI_Barrier(LJLMF.device->gpu_comm);
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
}
void ljl_gpu_clear() {
LJLMF.clear();
}
int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
return LJLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success);
}
void ljl_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success) {
LJLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success);
}
double ljl_gpu_bytes() {
return LJLMF.host_memory_usage();
}
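A hedged sketch of the calling sequence these bindings imply for a host pair style; the wrapper is illustrative only and every argument is a placeholder supplied by the caller.
// Sketch under assumptions: the ljl_gpu_* declarations come from this file;
// all data is owned by the calling pair style.
static bool lj_cut_gpu_setup_sketch(const int ntypes, double **cutsq,
                                    double **lj1, double **lj2, double **lj3,
                                    double **lj4, double **offset,
                                    double *special_lj, const int inum,
                                    const int nall, const int max_nbors,
                                    const int maxspecial,
                                    const double cell_size, FILE *screen) {
  int gpu_mode;
  bool ok = ljl_gpu_init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, special_lj,
                         inum, nall, max_nbors, maxspecial, cell_size,
                         gpu_mode, screen);
  // Per step the caller would then use ljl_gpu_compute_n() when the library
  // builds the neighbor list on the device, or ljl_gpu_compute() with a
  // host-built list, and finally ljl_gpu_clear() at the end of the run.
  return ok;
}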

View File

@ -0,0 +1,279 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL
#define MAX_SHARED_TYPES 8
#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif
#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif
#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif
#ifdef NV_KERNEL
#include "geryon/ucl_nv_kernel.h"
texture<float4> pos_tex;
#ifdef _DOUBLE_DOUBLE
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
return pos[i];
}
#else
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
return tex1Dfetch(pos_tex, i);
}
#endif
#else
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define fetch_pos(i,y) x_[i]
#endif
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
__global numtyp4* lj3, const int lj_types,
__global numtyp *sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the i atom whose neighbor list this thread processes
int ii=GLOBAL_ID_X;
__local numtyp sp_lj[4];
sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int itype=ix.w;
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int jtype=jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
int mtype=itype*lj_types+jtype;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
force*=factor_lj;
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
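// Editorial note on the neighbor encoding used above (a reading of the
// kernel, not text from the commit): an index j < nall is an ordinary
// neighbor and gets factor_lj = 1.0, while special 1-2/1-3/1-4 neighbors
// appear to be stored with nall, 2*nall or 3*nall added to the real index,
// so sp_lj[j/nall] selects the matching special_lj scale factor and
// "j %= nall" recovers the actual atom index before fetch_pos().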
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
__global numtyp4* lj3_in,
__global numtyp* sp_lj_in, __global int *dev_nbor,
__global acctyp4 *ans, __global acctyp *engv,
const int eflag, const int vflag, const int inum,
const int nall, const int nbor_pitch) {
// ii indexes the i atom whose neighbor list this thread processes
int ii=THREAD_ID_X;
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
if (ii<4)
sp_lj[ii]=sp_lj_in[ii];
if (ii<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
lj1[ii]=lj1_in[ii];
if (eflag>0)
lj3[ii]=lj3_in[ii];
}
ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X);
__syncthreads();
if (ii<inum) {
acctyp energy=(numtyp)0;
acctyp4 f;
f.x=(numtyp)0;
f.y=(numtyp)0;
f.z=(numtyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(numtyp)0;
__global int *nbor=dev_nbor+ii;
int i=*nbor;
nbor+=nbor_pitch;
int numj=*nbor;
nbor+=nbor_pitch;
__global int *list_end=nbor+mul24(numj,nbor_pitch);
numtyp4 ix=fetch_pos(i,x_); //x_[i];
int iw=ix.w;
int itype=mul24((int)MAX_SHARED_TYPES,iw);
numtyp factor_lj;
for ( ; nbor<list_end; nbor+=nbor_pitch) {
int j=*nbor;
if (j < nall)
factor_lj = (numtyp)1.0;
else {
factor_lj = sp_lj[j/nall];
j %= nall;
}
numtyp4 jx=fetch_pos(j,x_); //x_[j];
int mtype=itype+jx.w;
// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp r2inv = delx*delx+dely*dely+delz*delz;
if (r2inv<lj1[mtype].z) {
r2inv=(numtyp)1.0/r2inv;
numtyp r6inv = r2inv*r2inv*r2inv;
numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;
if (eflag>0) {
numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}
} // for nbor
// Store answers
__global acctyp *ap1=engv+ii;
if (eflag>0) {
*ap1=energy;
ap1+=inum;
}
if (vflag>0) {
for (int i=0; i<6; i++) {
*ap1=virial[i];
ap1+=inum;
}
}
ans[ii]=f;
} // if ii
}
#endif
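Both kernels store their per-atom answers with the same layout: ans[ii] holds the force, while engv is strided by inum, with the energy first (when eflag is set) followed by the six virial components. A hedged host-side sketch of unpacking that layout, assuming engv has already been copied back from the device and acctyp is double:
// Sketch under assumptions: engv was copied from the device; the caller
// zeroes virial[6] beforehand.
static double unpack_engv_sketch(const double *engv, const int inum,
                                 const bool eflag, const bool vflag,
                                 double virial[6]) {
  double evdwl = 0.0;
  int offset = 0;
  if (eflag) {
    for (int i = 0; i < inum; i++)   // energies occupy engv[0..inum)
      evdwl += engv[i];
    offset = inum;
  }
  if (vflag)
    for (int k = 0; k < 6; k++)      // six virial terms, each strided by inum
      for (int i = 0; i < inum; i++)
        virial[k] += engv[offset + k*inum + i];
  return evdwl;
}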

View File

@ -0,0 +1,150 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifdef USE_OPENCL
#include "lj_cut_gpu_cl.h"
#else
#include "lj_cut_gpu_ptx.h"
#endif
#include "lj_cut_gpu_memory.h"
#include <cassert>
#define LJL_GPU_MemoryT LJL_GPU_Memory<numtyp, acctyp>
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
template <class numtyp, class acctyp>
LJL_GPU_MemoryT::LJL_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
}
template <class numtyp, class acctyp>
LJL_GPU_MemoryT::~LJL_GPU_Memory() {
clear();
}
template <class numtyp, class acctyp>
int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}
template <class numtyp, class acctyp>
bool LJL_GPU_MemoryT::init(const int ntypes,
double **host_cutsq, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int nlocal,
const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *_screen) {
this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
_screen,lj_cut_gpu_kernel);
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
shared_types=true;
}
_lj_types=lj_types;
// Allocate a host write buffer for data initialization
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
UCL_WRITE_OPTIMIZED);
for (int i=0; i<lj_types*lj_types; i++)
host_write[i]=0.0;
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_cutsq);
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
host_offset);
UCL_H_Vec<double> dview;
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
dview.view(host_special_lj,4,*(this->ucl_device));
ucl_copy(sp_lj,dview,false);
_allocated=true;
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
return true;
}
template <class numtyp, class acctyp>
void LJL_GPU_MemoryT::clear() {
if (!_allocated)
return;
_allocated=false;
lj1.clear();
lj3.clear();
sp_lj.clear();
this->clear_atomic();
}
template <class numtyp, class acctyp>
double LJL_GPU_MemoryT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(LJL_GPU_Memory<numtyp,acctyp>);
}
// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;
if (_vflag)
vflag=1;
else
vflag=0;
int GX=static_cast<int>(ceil(static_cast<double>(this->atom->inum())/BX));
int ainum=this->atom->inum();
int anall=this->atom->nall();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
&lj3.begin(), &sp_lj.begin(),
&this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag,
&ainum, &anall, &nbor_pitch);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
&this->atom->dev_ans.begin(),
&this->atom->dev_engv.begin(), &eflag, &vflag, &ainum,
&anall, &nbor_pitch);
}
this->time_pair.stop();
}
template class LJL_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -0,0 +1,71 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#ifndef LJL_GPU_MEMORY_H
#define LJL_GPU_MEMORY_H
#include "atomic_gpu_memory.h"
template <class numtyp, class acctyp>
class LJL_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
public:
LJL_GPU_Memory();
~LJL_GPU_Memory();
/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device **/
bool init(const int ntypes, double **host_cutsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen);
/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();
/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library for pair style
double host_memory_usage() const;
// --------------------------- TYPE DATA --------------------------
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq
UCL_D_Vec<numtyp4> lj1;
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
UCL_D_Vec<numtyp4> lj3;
/// Special LJ values
UCL_D_Vec<numtyp> sp_lj;
/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
/// Number of atom types
int _lj_types;
private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
};
#endif

View File

@ -16,206 +16,270 @@
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include "nvc_macros.h"
#include "nvc_timer.h"
#include "nvc_device.h"
#include "pair_gpu_texture.h"
#include "pair_gpu_cell.h"
#include "lj_gpu_memory.cu"
#include <string.h>
#include "cudatimer.h"
#include "lj_tex.h"
#include "neigh.h"
#include "cell.h"
#include "lj_gpu_kernel.h"
#ifdef WINDLL
#define EXTERN extern "C" __declspec(dllexport)
#else
#define EXTERN
#endif
static float h_boxlo[3], h_boxhi[3];
static float cell_size;
static float *energy = NULL, *d_energy = NULL;
static float3 *d_force = NULL, *f_temp = NULL, *v_temp = NULL, *d_virial = NULL;
static float4 *d_pos = NULL, *temp_pos = NULL;
static int *d_type = NULL;
static int ncellx, ncelly, ncellz;
static LJ_GPU_Memory<PRECISION,ACC_PRECISION> LJMF;
#define LJMT LJ_GPU_Memory<numtyp,acctyp>
static neigh_list_gpu d_neigh_list;
static cell_list_gpu d_cell_list;
// ---------------------------------------------------------------------------
// Convert something to a string
// ---------------------------------------------------------------------------
#include <sstream>
template <class t>
inline string lj_gpu_toa(const t& in) {
ostringstream o;
o.precision(2);
o << in;
return o.str();
}
#define TIMING(x)
// ---------------------------------------------------------------------------
// Return string with GPU info
// ---------------------------------------------------------------------------
EXTERN void lj_gpu_name(const int id, const int max_nbors, char * name) {
string sname=LJMF.gpu.name(id)+", "+
lj_gpu_toa(LJMF.gpu.cores(id))+" cores, "+
lj_gpu_toa(LJMF.gpu.gigabytes(id))+" GB, "+
lj_gpu_toa(LJMF.gpu.clock_rate(id))+" GHZ";
strcpy(name,sname.c_str());
EXTERN void lj_gpu_name(const int id, const int max_nbors, char * name)
{
struct cudaDeviceProp prop;
CUDA_SAFE_CALL( cudaGetDeviceProperties(&prop, id) );
#ifdef _WIN32
strcpy_s(name, strlen(prop.name)+1, prop.name);
#else
strncpy(name, prop.name, strlen(prop.name)+1);
#endif
}
static bool _pc_cell_alloc;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
EXTERN bool lj_gpu_init(int &ij_size, const int ntypes,
double **cutsq,double **sigma,
double **epsilon, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, double *boxlo, double *boxhi,
double cellsize, double skin,
const int max_nbors, const int gpu_id)
{
int num_devices;
inline void _lj_gpu_clear() {
if (_pc_cell_alloc) {
free(energy);
free(v_temp);
cudaFreeHost(f_temp);
cudaFree(d_force);
cudaFree(d_energy);
cudaFree(d_virial);
clear_cell_list(cell_list_gpu);
_pc_cell_alloc=false;
/* get device count */
CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
if (num_devices == 0) {
printf("NO CUDA-capable GPU detected.\n");
exit(1);
}
if (gpu_id > num_devices) {
printf("gpu_id %d is larger than the number of GPUs %d\n",
gpu_id, num_devices);
exit(1);
}
/* set CUDA device to the specified GPU */
cudaThreadExit();
CUDA_SAFE_CALL( cudaSetDevice(gpu_id) );
ij_size=0;
cell_size = cellsize;
ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size);
for (int i = 0; i < 3; i++) {
h_boxhi[i] = boxhi[i];
h_boxlo[i] = boxlo[i];
}
init_force_const(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset);
init_cell_list_const(cellsize, skin, boxlo, boxhi);
return true;
}
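// Worked example of the cell-count arithmetic above (illustrative numbers,
// not values from this commit): for a box of length 30.0 in x with
// cell_size = 2.5, ncellx = ceil((30.0 + 2.0*2.5)/2.5) = 14, i.e. 12
// interior cells plus one layer of ghost cells on each side.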
// ---------------------------------------------------------------------------
// Clear memory on host and device
// ---------------------------------------------------------------------------
EXTERN void lj_gpu_clear() {
_lj_gpu_clear();
LJMF.clear();
free(energy);
free(v_temp);
CUDA_SAFE_CALL( cudaFreeHost(f_temp) );
if (d_force) CUDA_SAFE_CALL( cudaFree(d_force) );
if (d_energy) CUDA_SAFE_CALL( cudaFree(d_energy) );
if (d_virial) CUDA_SAFE_CALL( cudaFree(d_virial) );
if (d_pos) CUDA_SAFE_CALL( cudaFree(d_pos) );
if (d_type) CUDA_SAFE_CALL( cudaFree(d_type) );
if (temp_pos) CUDA_SAFE_CALL( cudaFreeHost(temp_pos) );
clear_neigh_list_gpu(d_neigh_list);
clear_cell_list_gpu(d_cell_list);
if (useCache) {
unbind_pos();
unbind_type();
}
//LJMF.clear();
}
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
EXTERN bool lj_gpu_init(int &ij_size, const int ntypes, double **cutsq,double **sigma,
double **epsilon, double **host_lj1, double **host_lj2,
double **host_lj3, double **host_lj4, double **offset,
double *special_lj, double *boxlo, double *boxhi,
double cell_size, double skin,
const int max_nbors, const int gpu_id) {
if (LJMF.is_allocated())
lj_gpu_clear();
else
_pc_cell_alloc=false;
LJMF.gpu.init();
if (LJMF.gpu.num_devices()==0)
return false;
ij_size=IJ_SIZE;
bool ret = LJMF.init(ij_size, ntypes, cutsq, sigma, epsilon, host_lj1, host_lj2,
host_lj3, host_lj4, offset, special_lj, max_nbors, gpu_id,
0,0);
ncellx = ceil(((boxhi[0] - boxlo[0]) + 2.0*cell_size) / cell_size);
ncelly = ceil(((boxhi[1] - boxlo[1]) + 2.0*cell_size) / cell_size);
ncellz = ceil(((boxhi[2] - boxlo[2]) + 2.0*cell_size) / cell_size);
init_cell_list_const(cell_size, skin, boxlo, boxhi);
return ret;
}
template <class numtyp, class acctyp>
double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
double **host_x, int *host_type, const int inum,
const int nall, const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
double _lj_gpu_neigh(double **force, double *virial,
double **host_x, int *host_type, const int inum,
const int nall, const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
cudaError_t err;
ljm.atom.nall(nall);
ljm.atom.inum(inum);
ljm.nbor.time_nbor.start();
ljm.nbor.time_nbor.stop();
double evdwl=0.0;
static int blockSize = BLOCK_1D;
static int ncell = ncellx*ncelly*ncellz;
static int first_call = 1;
TIMING( static CUDATimer cuTimer );
TIMING( static CTimer cTimer );
TIMING( static CTimer cTimer2 );
double *atom_pos = host_x[0];
static int szTailList = inum*32;
TIMING( cTimer.Start() );
TIMING( cTimer2.Start() );
/* MPI communication just happened, reallocate space using new inum & nall
FIXME: this is costly: ~ total kernel time! Use a DIY GPU memory allocator.*/
if (first_call || ago == 0) {
first_call = 0;
_lj_gpu_clear();
if (!first_call) {
if (useCache) {
unbind_pos();
unbind_type();
}
CUDA_SAFE_CALL( cudaFree(d_force) );
CUDA_SAFE_CALL( cudaFree(d_energy) );
CUDA_SAFE_CALL( cudaFree(d_virial) );
CUDA_SAFE_CALL( cudaFree(d_pos) );
CUDA_SAFE_CALL( cudaFree(d_type) );
clear_neigh_list_gpu(d_neigh_list);
CUDA_SAFE_CALL( cudaFreeHost(f_temp) );
CUDA_SAFE_CALL( cudaFreeHost(temp_pos) );
free(energy);
free(v_temp);
}
CUDA_SAFE_CALL( cudaMalloc((void**)&d_force, inum*sizeof(float3)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_energy, inum*sizeof(float)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_virial, inum*3*sizeof(float3)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_pos, nall*sizeof(float4)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_type, nall*sizeof(int)) );
init_neigh_list_gpu(d_neigh_list, inum, NEIGH_BIN_SIZE, szTailList);
CUDA_SAFE_CALL( cudaMallocHost((void**)&temp_pos, nall*sizeof(float4)) );
CUDA_SAFE_CALL( cudaMallocHost((void**)&f_temp, inum*sizeof(float3)) );
energy = (float*) malloc(inum*sizeof(float));
v_temp = (float3*)malloc(inum*2*sizeof(float3));
cudaMallocHost((void**)&f_temp, inum*sizeof(float3));
cudaMalloc((void**)&d_force, inum*sizeof(float3));
cudaMalloc((void**)&d_energy, inum*sizeof(float));
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
if (useCache) {
bind_pos(d_pos, nall);
bind_type(d_type, nall);
}
first_call = 0;
CUDA_SAFE_CALL( cudaThreadSynchronize() );
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaMemcpy(d_type, host_type, nall*sizeof(int),
cudaMemcpyHostToDevice) );
init_cell_list(cell_list_gpu, nall, ncell, blockSize);
_pc_cell_alloc=true;
}
// build cell-list on GPU
ljm.atom.time_atom.start();
build_cell_list(host_x[0], host_type, cell_list_gpu,
ncell, ncellx, ncelly, ncellz, blockSize, inum, nall, ago);
ljm.atom.time_atom.stop();
TIMING( static double mallocTime = 0. );
TIMING( mallocTime += cTimer2.GetET() );
TIMING( printf("malloc time = %f ms\n", mallocTime*1e3) );
ljm.time_pair.start();
TIMING( cTimer2.Start() );
for (int i = 0; i < 3*nall; i+=3) {
temp_pos[i/3] = make_float4(atom_pos[i], atom_pos[i+1], atom_pos[i+2], 0.f);
}
#ifdef TIMING
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
#endif
TIMING( static double copyTime = 0. );
TIMING( copyTime += cTimer2.GetET() );
TIMING( printf("position copy time = %f ms\n", copyTime*1e3) );
#define KERNEL_LJ_CELL(e, v, b, s) kernel_lj_cell<e,v,b><<<GX, BX, s>>> \
(d_force, d_energy, d_virial, \
cell_list_gpu.pos, \
cell_list_gpu.idx, \
cell_list_gpu.type, \
cell_list_gpu.natom, \
inum, nall, ncell, ncellx, ncelly, ncellz);
// call the cell-list force kernel
const int BX=blockSize;
dim3 GX(ncellx, ncelly*ncellz);
if (eflag == 0 && vflag == 0) {
if (blockSize == 64 ) KERNEL_LJ_CELL(false, false, 64, 0);
if (blockSize == 128) KERNEL_LJ_CELL(false, false, 128, 0);
if (blockSize == 256) KERNEL_LJ_CELL(false, false, 256, 0);
} else {
if (blockSize == 64) KERNEL_LJ_CELL(true, true, 64, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
if (blockSize == 128) KERNEL_LJ_CELL(true, true, 128, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
if (blockSize == 256) KERNEL_LJ_CELL(true, true, 256, 3*sizeof(float)*MAX_SHARED_TYPES*MAX_SHARED_TYPES);
TIMING( cTimer2.Start() );
CUDA_SAFE_CALL( cudaMemcpy(d_pos, temp_pos, nall*sizeof(float4), cudaMemcpyHostToDevice) );
TIMING( static double h2dTime = 0. );
TIMING( h2dTime += cTimer2.GetET() );
TIMING( printf("h2d copy time = %f ms\n", h2dTime*1e3) );
TIMING( cTimer2.Start() );
if (ago == 0) {
build_neigh_list_gpu(d_pos,
d_neigh_list,
h_boxlo, h_boxhi, cell_size,
inum, nall);
}
err = cudaGetLastError();
if (err != cudaSuccess) {
printf("LJ force kernel launch error: %d\n", err);
exit(1);
TIMING( static double neighTime = 0. );
TIMING( neighTime += cTimer2.GetET() );
TIMING( printf("Neigh List time = %f ms\n", neighTime*1e3) );
TIMING( cTimer2.Start() );
calc_lj_neigh_gpu(d_force, d_energy, d_virial,
d_pos, d_type,
d_neigh_list,
inum, nall,
eflag, vflag);
TIMING( static double forceTime = 0. );
TIMING( forceTime += cTimer2.GetET() );
TIMING( printf("Force time = %f ms\n", forceTime*1e3) );
TIMING( printf("GPU kernel time = %f ms\n", (forceTime + neighTime)*1e3) );
TIMING( cTimer2.Start() );
CUDA_SAFE_CALL( cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost) );
TIMING( static double d2hTime = 0. );
TIMING( d2hTime += cTimer2.GetET() );
TIMING( printf("d2h copy time = %f ms\n", d2hTime*1e3) );
TIMING( printf("GPU-CPU data transfer time = %f ms\n", (h2dTime+d2hTime)*1e3) );
TIMING( cTimer2.Start() );
for (int i = 0; i < inum; i++) {
force[i][0] += f_temp[i].x;
force[i][1] += f_temp[i].y;
force[i][2] += f_temp[i].z;
}
#ifdef TIMING
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float kTime;
cudaEventElapsedTime(&kTime, start, stop);
kernelTime += kTime;
printf("kernelTime = %f, eflag=%d, vflag=%d\n", kTime, eflag, vflag);
cudaEventDestroy(start);
cudaEventDestroy(stop);
#endif
// copy results from GPU to CPU
cudaMemcpy(f_temp, d_force, inum*sizeof(float3), cudaMemcpyDeviceToHost);
if (eflag) {
cudaMemcpy(energy, d_energy, inum*sizeof(float), cudaMemcpyDeviceToHost);
CUDA_SAFE_CALL( cudaMemcpy(energy, d_energy,
inum*sizeof(float), cudaMemcpyDeviceToHost) );
for (int i = 0; i < inum; i++) {
evdwl += energy[i];
}
evdwl *= 0.5f;
}
if (vflag) {
cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3), cudaMemcpyDeviceToHost);
CUDA_SAFE_CALL( cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3),
cudaMemcpyDeviceToHost) );
for (int i = 0; i < inum; i++) {
virial[0] += v_temp[2*i].x;
virial[1] += v_temp[2*i].y;
@ -228,43 +292,175 @@ double _lj_gpu_cell(LJMT &ljm, double **force, double *virial,
virial[i] *= 0.5f;
}
for (int i = 0; i < inum; i++) {
force[i][0] += f_temp[i].x;
force[i][1] += f_temp[i].y;
force[i][2] += f_temp[i].z;
}
ljm.time_pair.stop();
ljm.atom.time_atom.add_to_total();
ljm.nbor.time_nbor.add_to_total();
ljm.time_pair.add_to_total();
TIMING( static double postTime = 0. );
TIMING( postTime += cTimer2.GetET() );
TIMING( printf("postprocess Time = %f ms\n", postTime*1e3) );
TIMING( printf("Data process time = %f ms\n", (postTime+copyTime)*1e3) );
TIMING( static double totalTime = 0. );
TIMING( totalTime += cTimer.GetET() );
TIMING( printf("lj_gpu time = %f ms\n", totalTime*1e3) );
return evdwl;
}
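// Editorial note (a reading of the code, not text from the commit): each
// kernel accumulates the full pair energy and virial on both atoms of a
// pair, so the host-side reductions above halve the totals
// (evdwl *= 0.5f, virial[i] *= 0.5f) to avoid double counting.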
EXTERN double lj_gpu_cell(double **force, double *virial, double **host_x, int *host_type, const int inum, const int nall,
const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
EXTERN double lj_gpu_neigh(double **force, double *virial,
double **host_x, int *host_type,
const int inum, const int nall,
const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
return _lj_gpu_cell<PRECISION,ACC_PRECISION>(LJMF, force, virial, host_x, host_type, inum, nall,
ago, eflag, vflag, boxlo, boxhi);
return _lj_gpu_neigh<float,float>(force, virial,
host_x, host_type, inum, nall,
ago, eflag, vflag, boxlo, boxhi);
}
template <class numtyp, class acctyp>
double _lj_gpu_cell(double **force, double *virial,
double **host_x, int *host_type, const int inum,
const int nall, const int ago,
const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
double evdwl=0.0;
static int ncell = ncellx*ncelly*ncellz;
static int first_call = 1;
// allocate memory on CPU and GPU
if (first_call || ago == 0) {
if (!first_call) {
if (useCache) {
unbind_pos();
unbind_type();
}
free(energy);
free(v_temp);
CUDA_SAFE_CALL( cudaFree(d_force) );
CUDA_SAFE_CALL( cudaFree(d_energy) );
CUDA_SAFE_CALL( cudaFree(d_virial) );
CUDA_SAFE_CALL( cudaFree(d_pos) );
CUDA_SAFE_CALL( cudaFree(d_type) );
CUDA_SAFE_CALL( cudaFreeHost(f_temp) );
CUDA_SAFE_CALL( cudaFreeHost(temp_pos) );
clear_cell_list_gpu(d_cell_list);
}
energy = (float*) malloc(inum*sizeof(float));
v_temp = (float3*)malloc(inum*2*sizeof(float3));
cudaMalloc((void**)&d_force, inum*sizeof(float3));
cudaMalloc((void**)&d_energy, inum*sizeof(float));
cudaMalloc((void**)&d_virial, inum*3*sizeof(float3));
CUDA_SAFE_CALL( cudaMalloc((void**)&d_pos, nall*sizeof(float4)) );
CUDA_SAFE_CALL( cudaMalloc((void**)&d_type, nall*sizeof(int)) );
CUDA_SAFE_CALL( cudaMallocHost((void**)&f_temp, inum*sizeof(float3)) );
CUDA_SAFE_CALL( cudaMallocHost((void**)&temp_pos, nall*sizeof(float4)) );
init_cell_list_gpu(d_cell_list, nall, ncell);
CUDA_SAFE_CALL( cudaMemcpy(d_type, host_type, nall*sizeof(int),
cudaMemcpyHostToDevice) );
if (useCache) {
bind_pos(d_pos, nall);
bind_type(d_type, nall);
}
first_call = 0;
}
/* build cell-list on GPU */
double *atom_pos = host_x[0];
for (int i = 0; i < 3*nall; i+=3) {
temp_pos[i/3] = make_float4(atom_pos[i], atom_pos[i+1], atom_pos[i+2], 0.f);
}
CUDA_SAFE_CALL( cudaMemcpy(d_pos, temp_pos, nall*sizeof(float4),
cudaMemcpyHostToDevice) );
if (ago == 0) {
build_cell_list_gpu(d_pos, d_cell_list, h_boxlo, h_boxhi,
cell_size, inum, nall);
}
calc_lj_cell_gpu(d_force, d_energy, d_virial,
d_pos, d_type, d_cell_list,
inum, nall, ncellx,
ncelly, ncellz, cell_size,
eflag, vflag);
CUDA_SAFE_CALL( cudaMemcpy(f_temp, d_force, inum*sizeof(float3),
cudaMemcpyDeviceToHost) );
for (int i = 0; i < inum; i++) {
force[i][0] += f_temp[i].x;
force[i][1] += f_temp[i].y;
force[i][2] += f_temp[i].z;
}
if (eflag) {
CUDA_SAFE_CALL( cudaMemcpy(energy, d_energy,
inum*sizeof(float), cudaMemcpyDeviceToHost) );
for (int i = 0; i < inum; i++) {
evdwl += energy[i];
}
evdwl *= 0.5f;
}
if (vflag) {
CUDA_SAFE_CALL( cudaMemcpy(v_temp, d_virial, inum*2*sizeof(float3),
cudaMemcpyDeviceToHost) );
for (int i = 0; i < inum; i++) {
virial[0] += v_temp[2*i].x;
virial[1] += v_temp[2*i].y;
virial[2] += v_temp[2*i].z;
virial[3] += v_temp[2*i+1].x;
virial[4] += v_temp[2*i+1].y;
virial[5] += v_temp[2*i+1].z;
}
for (int i = 0; i < 6; i++)
virial[i] *= 0.5f;
}
return evdwl;
}
EXTERN double lj_gpu_cell(double **force, double *virial,
double **host_x, int *host_type,
const int inum, const int nall,
const int ago, const bool eflag, const bool vflag,
const double *boxlo, const double *boxhi)
{
return _lj_gpu_cell<float,float>(force, virial,
host_x, host_type, inum, nall,
ago, eflag, vflag, boxlo, boxhi);
}
EXTERN void lj_gpu_time() {
cout.precision(4);
cout << "Atom copy: " << LJMF.atom.time_atom.total_seconds() << " s.\n";
cout << "Neighbor copy: " << LJMF.nbor.time_nbor.total_seconds() << " s.\n";
cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n";
cout << "Answer copy: " << LJMF.atom.time_answer.total_seconds() << " s.\n";
/* cout.precision(4);
cout << "Atom copy: " << LJMF.time_atom.total_seconds() << " s.\n";
cout << "Neighbor copy: " << LJMF.time_nbor.total_seconds() << " s.\n";
cout << "LJ calc: " << LJMF.time_pair.total_seconds() << " s.\n";*/
//cout << "Answer copy: " << LJMF.time_answer.total_seconds() << " s.\n";
}
EXTERN int lj_gpu_num_devices() {
return LJMF.gpu.num_devices();
int num_devices;
CUDA_SAFE_CALL( cudaGetDeviceCount(&num_devices) );
return num_devices;
}
EXTERN double lj_gpu_bytes() {
return LJMF.host_memory_usage();
return 0.0;
}

View File

@ -1,220 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef LJ_GPU_KERNEL
#define LJ_GPU_KERNEL
/* Cell list version of LJ kernel */
template<bool eflag, bool vflag, int blockSize>
__global__ void kernel_lj_cell(float3 *force3,
float *energy, float3 *virial,
float3 *cell_list, unsigned int *cell_idx,
int *cell_type, int *cell_atom,
const int inum, const int nall, const int ncell,
const int ncellx, const int ncelly, const int ncellz)
{
// calculate 3D block idx from 2d block
int bx = blockIdx.x;
int by = blockIdx.y % ncelly;
int bz = blockIdx.y / ncelly;
int tid = threadIdx.x;
// compute cell idx from 3D block idx
int cid = bx + INT_MUL(by, ncellx) + INT_MUL(bz, INT_MUL(ncellx,ncelly));
__shared__ int typeSh[blockSize];
__shared__ float posSh[blockSize*3];
__shared__ float cutsqSh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ float lj1Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__shared__ float lj2Sh[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
extern __shared__ float smem[];
__shared__ float *lj3Sh;
__shared__ float *lj4Sh;
__shared__ float *offsetSh;
// load force parameters into shared memory
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
int itype = i/MAX_SHARED_TYPES;
int jtype = i%MAX_SHARED_TYPES;
cutsqSh[i] = _cutsq_<float>(itype,jtype);
lj1Sh[i] = _lj1_<float>(itype,jtype).x;
lj2Sh[i] = _lj1_<float>(itype,jtype).y;
}
// Only allocate shared memory when needed,
// this reduces shared memory limitation on occupancy
if (eflag || vflag) {
lj3Sh = smem;
lj4Sh = lj3Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
offsetSh = lj4Sh + MAX_SHARED_TYPES*MAX_SHARED_TYPES;
for (int i = tid; i < MAX_SHARED_TYPES*MAX_SHARED_TYPES; i += blockSize) {
int itype = i/MAX_SHARED_TYPES;
int jtype = i%MAX_SHARED_TYPES;
lj3Sh[i] = _lj3_<float>(itype,jtype).x+0.01;
lj4Sh[i] = _lj3_<float>(itype,jtype).y;
offsetSh[i]= _offset_<float>(itype,jtype);
}
}
__syncthreads();
int nborz0 = max(bz-1,0), nborz1 = min(bz+1, ncellz-1),
nbory0 = max(by-1,0), nbory1 = min(by+1, ncelly-1),
nborx0 = max(bx-1,0), nborx1 = min(bx+1, ncellx-1);
for (int ii = 0; ii < ceil((float)(cell_atom[cid])/blockSize); ii++) {
float3 f = {0.0f, 0.0f, 0.0f};
float ener = 0.0f;
float3 v0 = {0.0f, 0.0f, 0.0f}, v1 = {0.0f, 0.0f, 0.0f};
int itype;
float ix, iy, iz;
int i = tid + ii*blockSize;
unsigned int answer_pos = cell_idx[cid*blockSize+i];
// load current cell atom position and type into sMem
for (int j = tid; j < cell_atom[cid]; j += blockSize) {
int pid = cid*blockSize + j;
float3 pos = cell_list[pid];
posSh[j ] = pos.x;
posSh[j+ blockSize] = pos.y;
posSh[j+2*blockSize] = pos.z;
typeSh[j] = cell_type[pid];
}
__syncthreads();
if (answer_pos < inum) {
itype = typeSh[i];
ix = posSh[i ];
iy = posSh[i+ blockSize];
iz = posSh[i+2*blockSize];
// compute force from current cell
for (int j = 0; j < cell_atom[cid]; j++) {
if (j == i) continue;
float delx = ix - posSh[j ];
float dely = iy - posSh[j+ blockSize];
float delz = iz - posSh[j+2*blockSize];
int jtype = typeSh[j];
int mtype = itype + jtype*MAX_SHARED_TYPES;
float r2inv = delx*delx + dely*dely + delz*delz;
if (r2inv < cutsqSh[mtype]) {
r2inv = 1.0f/r2inv;
float r6inv = r2inv * r2inv * r2inv;
float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
f.x += delx * force;
f.y += dely * force;
f.z += delz * force;
if (eflag) {
float e = r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
ener += (e - offsetSh[mtype]);
}
if (vflag) {
v0.x += delx*delx*force;
v0.y += dely*dely*force;
v0.z += delz*delz*force;
v1.x += delx*dely*force;
v1.y += delx*delz*force;
v1.z += dely*delz*force;
}
}
}
}
__syncthreads();
// compute force from neighboring cells
for (int nborz = nborz0; nborz <= nborz1; nborz++) {
for (int nbory = nbory0; nbory <= nbory1; nbory++) {
for (int nborx = nborx0; nborx <= nborx1; nborx++) {
if (nborz == bz && nbory == by && nborx == bx) continue;
// compute cell id
int cid_nbor = nborx + INT_MUL(nbory,ncellx) +
INT_MUL(nborz,INT_MUL(ncellx,ncelly));
// load neighbor cell position and type into smem
for (int j = tid; j < cell_atom[cid_nbor]; j += blockSize) {
int pid = INT_MUL(cid_nbor,blockSize) + j;
float3 pos = cell_list[pid];
posSh[j ] = pos.x;
posSh[j+ blockSize] = pos.y;
posSh[j+2*blockSize] = pos.z;
typeSh[j] = cell_type[pid];
}
__syncthreads();
// compute force
if (answer_pos < inum) {
for (int j = 0; j < cell_atom[cid_nbor]; j++) {
float delx = ix - posSh[j ];
float dely = iy - posSh[j+ blockSize];
float delz = iz - posSh[j+2*blockSize];
int jtype = typeSh[j];
int mtype = itype + jtype*MAX_SHARED_TYPES;
float r2inv = delx*delx + dely*dely + delz*delz;
if (r2inv < cutsqSh[mtype]) {
r2inv = 1.0f/r2inv;
float r6inv = r2inv * r2inv * r2inv;
float force = r2inv*r6inv*(lj1Sh[mtype]*r6inv - lj2Sh[mtype]);
f.x += delx * force;
f.y += dely * force;
f.z += delz * force;
if (eflag) {
float e=r6inv*(lj3Sh[mtype]*r6inv - lj4Sh[mtype]);
ener += (e-offsetSh[mtype]);
}
if (vflag) {
v0.x += delx*delx*force;
v0.y += dely*dely*force;
v0.z += delz*delz*force;
v1.x += delx*dely*force;
v1.y += delx*delz*force;
v1.z += dely*delz*force;
}
}
}
}
__syncthreads();
}
}
}
if (answer_pos < inum) {
force3[answer_pos] = f;
if (eflag)
energy[answer_pos] = ener;
if (vflag) {
virial[2*answer_pos] = v0;
virial[2*answer_pos+1] = v1;
}
}
}
}
#endif

View File

@ -1,147 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#include "lj_gpu_memory.h"
#define LJ_GPU_MemoryT LJ_GPU_Memory<numtyp, acctyp>
template <class numtyp, class acctyp>
int LJ_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
return atom.bytes_per_atom()+nbor.bytes_per_atom(max_nbors);
}
template <class numtyp, class acctyp>
bool LJ_GPU_MemoryT::init(const int ij_size, const int ntypes,
double **host_cutsq, double **host_sigma,
double **host_epsilon, double **host_lj1,
double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset,
double *host_special_lj, const int max_nbors,
const int me, const int nlocal, const int nall) {
if (allocated)
clear();
if (me>=gpu.num_devices())
return false;
gpu.set(me);
if (gpu.revision()<1.0)
return false;
// Initialize timers for the selected GPU
time_pair.init();
// Initialize atom and nbor data
max_local=static_cast<int>(static_cast<double>(nlocal)*1.10);
if (max_local==0)
max_local=1000;
if (nall<=nlocal)
max_atoms=max_local*2;
else
max_atoms=static_cast<int>(static_cast<double>(nall)*1.10);
if (!atom.init(max_atoms))
return false;
if (!nbor.init(ij_size,max_local,max_nbors))
return false;
// Get a stream for computing pair potentials
CUDA_SAFE_CALL(cudaStreamCreate(&pair_stream));
// Use the write buffer from atom for data initialization
NVC_HostT &host_write=atom.host_write;
assert(host_write.numel()>4 && host_write.numel()>ntypes*ntypes*2);
// Copy data for bonded interactions
special_lj.safe_alloc(4);
special_lj.cast_copy(host_special_lj,host_write);
// Copy sigma, epsilon, and cutsq onto GPU
sigma.safe_alloc(ntypes,ntypes,sigma_get_texture<numtyp>());
sigma.cast_copy(host_sigma[0],host_write);
epsilon.safe_alloc(ntypes,ntypes,epsilon_get_texture<numtyp>());
epsilon.cast_copy(host_epsilon[0],host_write);
cutsq.safe_alloc(ntypes,ntypes,cutsq_get_texture<numtyp>());
cutsq.cast_copy(host_cutsq[0],host_write);
// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
if (lj_types<=MAX_SHARED_TYPES) {
lj_types=MAX_SHARED_TYPES;
shared_types=true;
}
offset.safe_alloc(lj_types,lj_types,offset_get_texture<numtyp>());
offset.cast_copy2D(host_offset[0],host_write,ntypes,ntypes);
double *t1=host_lj1[0];
double *t2=host_lj2[0];
for (int i=0; i<ntypes*ntypes; i++) {
host_write[i*2]=t1[i];
host_write[i*2+1]=t2[i];
}
lj1.safe_alloc(lj_types,lj_types,lj1_get_texture<numtyp>());
lj1.copy_2Dfrom_host(reinterpret_cast<typename nvc_vec_traits<numtyp>::vec2 *> (host_write.begin()),
ntypes,ntypes);
t1=host_lj3[0];
t2=host_lj4[0];
for (int i=0; i<ntypes*ntypes; i++) {
host_write[i*2]=t1[i];
host_write[i*2+1]=t2[i];
}
lj3.safe_alloc(lj_types,lj_types,lj3_get_texture<numtyp>());
lj3.copy_2Dfrom_host(reinterpret_cast<typename nvc_vec_traits<numtyp>::vec2 *> (host_write.begin()),
ntypes,ntypes);
dev_error.safe_alloc(1);
dev_error.zero();
allocated=true;
return true;
}
template <class numtyp, class acctyp>
void LJ_GPU_MemoryT::clear() {
if (!allocated)
return;
allocated=false;
// Check for any pair style specific errors here
int err_flag;
dev_error.copy_to_host(&err_flag);
atom.clear();
nbor.clear();
CUDA_SAFE_CALL(cudaStreamDestroy(pair_stream));
dev_error.clear();
sigma.clear();
epsilon.clear();
special_lj.clear();
cutsq.clear();
offset.clear();
lj1.clear();
lj3.clear();
}
template <class numtyp, class acctyp>
double LJ_GPU_MemoryT::host_memory_usage() const {
return atom.host_memory_usage(max_atoms)+nbor.host_memory_usage()+
sizeof(LJ_GPU_Memory<numtyp,acctyp>);
}
template class LJ_GPU_Memory<PRECISION,ACC_PRECISION>;

View File

@ -1,87 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (SNL), wmbrown@sandia.gov
Peng Wang (Nvidia), penwang@nvidia.com
Paul Crozier (SNL), pscrozi@sandia.gov
------------------------------------------------------------------------- */
#ifndef LJ_GPU_MEMORY_H
#define LJ_GPU_MEMORY_H
#include "nvc_device.h"
#include "nvc_traits.h"
#include "pair_gpu_atom.h"
#include "pair_gpu_nbor.h"
#define BLOCK_1D 64 // max value = 256
#define CELL_SIZE BLOCK_1D
#define MAX_SHARED_TYPES 8
#define BIG_NUMBER 100000000
template <class numtyp, class acctyp>
class LJ_GPU_Memory {
public:
LJ_GPU_Memory() : allocated(false) {}
~LJ_GPU_Memory() { clear(); }
inline bool is_allocated() { return allocated; }
/// Allocate memory on host and device
bool init(const int ij_size, const int ntypes, double **host_cutsq,
double **host_sigma, double **host_epsilon,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double **host_offset, double *host_special_lj,
const int max_nbors, const int me, const int nlocal,
const int nall);
/// Free any memory on host and device
void clear();
/// Returns memory usage on GPU per atom
int bytes_per_atom(const int max_nbors) const;
/// Total host memory used by library
double host_memory_usage() const;
// ------------------------- DATA -----------------------------
// Device Properties
NVCDevice gpu;
// Device Error Flag
NVC_VecI dev_error;
// Stream for asynchronous work
cudaStream_t pair_stream;
// Atom Data
PairGPUAtom<numtyp,acctyp> atom;
// Neighbor Data
PairGPUNbor nbor;
// --------------- Const Data for Atoms
NVC_ConstMatT sigma, epsilon, cutsq, offset;
NVC_ConstMat< typename nvc_vec_traits<numtyp>::vec2 > lj1, lj3;
NVC_VecT special_lj;
size_t max_atoms, max_local;
// Timing for pair calculation
NVCTimer time_pair;
// If atom type constants fit in shared memory, use fast kernels
bool shared_types;
protected:
bool allocated;
};
#endif

129
lib/gpu/ljc_cut_gpu.cpp Normal file
View File

@ -0,0 +1,129 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
------------------------------------------------------------------------- */
#include <iostream>
#include <cassert>
#include <math.h>
#include "ljc_cut_gpu_memory.h"
using namespace std;
static LJC_GPU_Memory<PRECISION,ACC_PRECISION> LJCMF;
// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double **offset, double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double **host_cut_ljsq, double **host_cut_coulsq,
double *host_special_coul, const double qqrd2e) {
LJCMF.clear();
gpu_mode=LJCMF.device->gpu_mode();
double gpu_split=LJCMF.device->particle_split();
int first_gpu=LJCMF.device->first_device();
int last_gpu=LJCMF.device->last_device();
int world_me=LJCMF.device->world_me();
int gpu_rank=LJCMF.device->gpu_rank();
int procs_per_gpu=LJCMF.device->procs_per_gpu();
LJCMF.device->init_message(screen,"lj/cut/coul/cut",first_gpu,last_gpu);
bool message=false;
if (world_me==0 && screen)
message=true;
if (message) {
fprintf(screen,"Initializing GPU and compiling on process 0...");
fflush(screen);
}
if (world_me==0) {
bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e);
if (!init_ok)
return false;
}
MPI_Barrier(MPI_COMM_WORLD);
if (message)
fprintf(screen,"Done.\n");
for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing GPU %d on core %d...",gpu_rank,i);
else
fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0) {
bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
maxspecial, cell_size, gpu_split,
screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e);
if (!init_ok)
return false;
}
MPI_Barrier(LJCMF.device->gpu_comm);
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");
return true;
}
void ljc_gpu_clear() {
LJCMF.clear();
}
int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *boxlo, double *boxhi, int *tag, int **nspecial,
int **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q) {
return LJCMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo,
boxhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, cpu_time, success, host_q);
}
void ljc_gpu_compute(const int timestep, const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, const double cpu_time,
bool &success, double *host_q) {
LJCMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj,
firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
host_q);
}
double ljc_gpu_bytes() {
return LJCMF.host_memory_usage();
}
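Relative to the lj/cut bindings earlier in this diff, the lj/cut/coul/cut interface adds the per-type LJ and Coulomb cutoffs, the special_coul factors, the qqrd2e conversion constant, and a per-atom charge array on every compute call. A hedged sketch, with placeholder arguments owned by the caller:
// Sketch under assumptions: the ljc_gpu_* declarations come from this file;
// all tables and sizes are placeholders supplied by the calling pair style.
static bool lj_coul_gpu_setup_sketch(const int ntypes, double **cutsq,
                                     double **lj1, double **lj2, double **lj3,
                                     double **lj4, double **offset,
                                     double *special_lj, const int inum,
                                     const int nall, const int max_nbors,
                                     const int maxspecial,
                                     const double cell_size, FILE *screen,
                                     double **cut_ljsq, double **cut_coulsq,
                                     double *special_coul,
                                     const double qqrd2e) {
  int gpu_mode;
  bool ok = ljc_gpu_init(ntypes, cutsq, lj1, lj2, lj3, lj4, offset, special_lj,
                         inum, nall, max_nbors, maxspecial, cell_size, gpu_mode,
                         screen, cut_ljsq, cut_coulsq, special_coul, qqrd2e);
  // Per-step calls then pass host_q (per-atom charges) to ljc_gpu_compute_n()
  // or ljc_gpu_compute(), and ljc_gpu_clear() releases everything at the end.
  return ok;
}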

Some files were not shown because too many files have changed in this diff.