git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@15268 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2016-07-06 23:26:44 +00:00
parent fd27214f7d
commit 65c1e16401
174 changed files with 0 additions and 22457 deletions

View File

@ -1,4 +0,0 @@
# Makefile for liblammpscuda.a (USER-CUDA support library).
# No need to modify anything here!  The CUDA path is inserted into Makefile.common,
# and all build logic lives in Makefile.cudalib.
include Makefile.cudalib

View File

@ -1,123 +0,0 @@
#Common commandline argument interpreter for compilation with lammpscuda (USER-CUDA) installed
# make options:
# emu=1 switch to cuda emulation mode (otherwise: use gpu)
# dbg=1 print a lot of debugging output during runtime
# verbose=1 output nvcc command line during compilation
# keep=1 do not delete temporary compilation files (.ii, .cubin, ...)
# cufft=1 use cuda's fast fourier transformation lib "cufft" where possible (otherwise: use cpu fftw)
# binning=1 create virtual particle grid (neighbor-lists otherwise); currently this is not supported
# precision=1 single precision (global setting)
# precision=2 double precision (global setting)
# precision=3 positions in double precision, everything else single
# precision=4 positions and velocities in double precision, everything else single
SHELL = /bin/sh
# System-specific settings
CUDA_INSTALL_PATH = /usr/local/cuda
#CUDA_INSTALL_PATH = /home/crtrott/lib/cuda
# e.g. in Gentoo
# CUDA_INSTALL_PATH = /opt/cuda
#//////////////////////////////////////////////////////////////////////////////////////////////
# no need to change anything below this line
#//////////////////////////////////////////////////////////////////////////////////////////////
#use CPU FFT if cufft=0 is requested.
FALLBACK_FFT = 1
#default settings for compiler switches
# COMPILELIB is defined by Makefile.cudalib when the library itself is built
# (defaults file is local); otherwise this file is included from a LAMMPS
# src-level build and the defaults live in ../../lib/cuda.
ifdef COMPILELIB
include Makefile.defaults
else
include ../../lib/cuda/Makefile.defaults
endif
#shell echo "Compiling with precision = " ${precision} ", arch = " ${arch} ", cufft = " ${cufft} ", dbg = " ${dbg} ", prec_timer = " ${prec_timer}
CUDA_FLAGS := -I${CUDA_INSTALL_PATH}/include -DUNIX
CUDA_USRLIB_CONDITIONAL := -L${CUDA_INSTALL_PATH}/lib -L${CUDA_INSTALL_PATH}/lib64
# debug setting
# dbg=1: define _DEBUG for host/device code and compile nvcc with -g -G;
# otherwise build optimized (-O3, no strict aliasing).
ifeq ($(strip $(dbg)), 1)
CUDA_FLAGS += -D_DEBUG -g
NVCC_FLAGS += -g -G
else
NVCC_FLAGS += --compiler-options -fno-strict-aliasing -O3
endif
# skip timing on Mac and Windows manually
ifeq ($(strip $(prec_timer)), 0)
CUDA_FLAGS += -DNO_PREC_TIMING
endif
# set fft routine
# With FALLBACK_FFT = 1 (set above), cufft=0 silently keeps the host FFT
# settings; only with the fallback disabled is FFT compiled out via FFT_NONE.
ifeq ($(strip $(cufft)), 0)
ifneq ($(FALLBACK_FFT), 1)
FFT_INC = -DFFT_NONE
FFT_PATH =
FFT_LIB =
CUDA_FLAGS += -DFFT_NONE
endif
else
CUDA_FLAGS += -DFFT_CUFFT
CUDA_USRLIB_CONDITIONAL += -lcufft
endif
# make global precision setting
# precision=1 -> all single; 3 -> positions double; 4 -> positions+velocities
# double; any other value (including the default, 2) -> all double.
ifeq ($(strip $(precision)), 1)
CUDA_FLAGS += -DCUDA_PRECISION=1
else
ifeq ($(strip $(precision)), 3)
CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2
else
ifeq ($(strip $(precision)), 4)
CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2
else
CUDA_FLAGS += -DCUDA_PRECISION=2
endif
endif
endif
# make architecture settings
# NOTE(review): arch values 21/30/35 generate sm_21/sm_30/sm_35 code but still
# define -DCUDA_ARCH=20, and an unrecognized arch defines CUDA_ARCH=99 while
# falling back to sm_13 -- confirm this is intended before changing.
ifeq ($(strip $(arch)), 13)
CUDA_FLAGS += -DCUDA_ARCH=13
SMVERSIONFLAGS := -arch sm_13
else
ifeq ($(strip $(arch)), 20)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_20
else
ifeq ($(strip $(arch)), 21)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_21
else
ifeq ($(strip $(arch)), 30)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_30
else
ifeq ($(strip $(arch)), 35)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_35
else
CUDA_FLAGS += -DCUDA_ARCH=99
SMVERSIONFLAGS := -arch sm_13
endif
endif
endif
endif
endif
CCFLAGS := $(CCFLAGS) $(CUDA_FLAGS) \
-I$(CUDA_INSTALL_PATH)/include

View File

@ -1,87 +0,0 @@
# Makefile for liblammpscuda.a
# No need to modify anything here!  The CUDA path is inserted into Makefile.common.
.DEFAULT: lib

# Tell Makefile.common that the library itself is being built (it then
# includes Makefile.defaults from the current directory).
COMPILELIB := 1

SHELL = /bin/sh

CUDA_SRC_DIR = ../cuda
CUDA_TEMP = $(CUDA_SRC_DIR)/.lastmake
CUDA_TEMP_DUMMY := $(shell touch $(CUDA_TEMP) )
include $(CUDA_TEMP)

# Every *_kernel.cu source yields one *_cu.o object in this directory.
CUDA_CU = $(wildcard $(CUDA_SRC_DIR)/*_kernel.cu)
CUDA_CUO = $(CUDA_CU:_kernel.cu=_cu.o)
CUDA_OBJ = $(subst $(CUDA_SRC_DIR)/,,$(CUDA_CUO))
CUDA_DEP = $(CUDA_OBJ:.o=.d)
NVCC_FLAGS :=
VPATH = $(CUDA_SRC_DIR)

#rewriting default settings if new ones are specified
ifdef precision
tmp := $(shell sed -i 's|precision ?= [0-9]|precision ?= '${precision}'|g' Makefile.defaults)
endif
ifdef arch
tmp := $(shell sed -i 's|arch ?= [0-9][0-9]|arch ?= '${arch}'|g' Makefile.defaults)
endif
ifdef cufft
tmp := $(shell sed -i 's|cufft ?= [0-9]|cufft ?= '${cufft}'|g' Makefile.defaults)
endif
ifdef dbg
tmp := $(shell sed -i 's|dbg ?= [0-9]|dbg ?= '${dbg}'|g' Makefile.defaults)
endif
ifdef prec_timer
tmp := $(shell sed -i 's|prec_timer ?= [0-9]|prec_timer ?= '${prec_timer}'|g' Makefile.defaults)
endif

include Makefile.common

# Regenerate lines 2-3 of Makefile.lammps with the flags chosen above so the
# LAMMPS src build imports settings consistent with this library build.
tmp := $(shell sed -i '2 d' Makefile.lammps)
tmp := $(shell sed -i '2 d' Makefile.lammps)
tmp := $(shell sed -i '1a CUDA_FLAGS := ${CUDA_FLAGS}' Makefile.lammps)
tmp := $(shell sed -i '2a CUDA_USRLIB_CONDITIONAL := ${CUDA_USRLIB_CONDITIONAL}' Makefile.lammps)

# verbose nvcc output during compilation
ifeq ($(verbose), 1)
VERBOSE :=
NVCC_FLAGS += --ptxas-options=-v
else
VERBOSE := @
endif

# keep temporary compilation files of nvcc
ifeq ($(keep), 1)
NVCC_FLAGS += -keep -Xptxas="--verbose"
endif

NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc
CUDA_INCLUDES = -I./ -I$(CUDA_INSTALL_PATH)/include -I../../src/USER-CUDA
CUDA_USRLIB =

# 'lib' and 'clean' are commands, not files: mark them phony so a stray file
# of the same name cannot shadow them.
.PHONY: lib clean

# Link target
lib: $(CUDA_OBJ)
	$(NVCC) -lib $(CUDA_OBJ) $(CUDA_FLAGS) $(CUDA_USRLIB_CONDITIONAL) -o liblammpscuda.a

# -f: do not fail when there is nothing to remove yet
clean:
	rm -f $(CUDA_SRC_DIR)/*.o
	rm -f liblammpscuda.a

# Cuda compilation rules
%_cu.o: %.cu %_kernel.cu %_cu.h cuda_shared.h
	$(VERBOSE)$(NVCC) $(NVCC_FLAGS) $(CUDA_FLAGS) $(CUDA_INCLUDES) $(CUDA_USRLIB) $(SMVERSIONFLAGS) -o $@ -c $<

View File

@ -1,19 +0,0 @@
# Default build settings for the USER-CUDA library.  The assignment lines below
# are rewritten in place by sed commands in Makefile.cudalib when the user
# passes precision=/arch=/cufft=/dbg=/prec_timer= on the make command line, so
# keep their exact "name ?= value" layout.
#precision setting: 1 single, 2 double, 3 positions in double, 4 positions and velocities in double
precision ?= 2
#verbose setting: 0 no, 1 yes
verbose ?= 1
#GPU architecture (compute capability): 13, 20, 21, 30, 35
arch ?= 21
#Using cufft (should not be changed)
cufft ?= 1
#Using dbg mode
dbg ?= 0
#On mac machines set this to 0 in order to avoid usage of linux specific precision timer
prec_timer ?= 1

View File

@ -1,7 +0,0 @@
# Settings that the LAMMPS build will import when this package library is used
CUDA_FLAGS := -I/usr/local/cuda/include -DUNIX -DFFT_CUFFT -DCUDA_PRECISION=2 -DCUDA_ARCH=20
CUDA_USRLIB_CONDITIONAL := -L/usr/local/cuda/lib -L/usr/local/cuda/lib64
# NOTE: the two lines above are deleted and regenerated via sed by
# Makefile.cudalib on every library build; do not edit them by hand and do not
# insert lines before them (the sed scripts address them by line number).
user-cuda_SYSINC = ${CUDA_FLAGS}
user-cuda_SYSLIB = -lcufft -lcuda -lcudart
user-cuda_SYSPATH = $(CUDA_USRLIB_CONDITIONAL)

View File

@ -1,59 +0,0 @@
USER-CUDA library
Christian Trott, crtrott at sandia.gov
-------------------------------------------------------------------
This directory has source files to build a library that LAMMPS links
against when using the USER-CUDA package.
This library must be built before LAMMPS is built, so LAMMPS can link
against it. The build process also writes settings into the
Makefile.lammps file which are used when files in the src/USER-CUDA
package are compiled.
Thus if you re-build this library (e.g. for a different precision),
you MUST re-compile the src/USER-CUDA files as well. You can force
this to happen by uninstalling, then re-installing the USER-CUDA
package (make no-user-cuda; make yes-user-cuda) before doing
a LAMMPS build.
Build this library in two steps. First type:
make OPTIONS
where OPTIONS is one or more of the following settings:
precision=N to set the precision level
N = 1 for single precision
N = 2 for double precision (default)
N = 3 for positions in double precision
N = 4 for positions and velocities in double precision
arch=M to set GPU compute capability
M = 20 for CC2.0 (GF100/110, e.g. C2050,GTX580,GTX470)
M = 21 for CC2.1 (GF104/114, e.g. GTX560, GTX460, GTX450) (default)
M = 13 for CC1.3 (GF200, e.g. C1060, GTX285)
prec_timer=0/1 to use hi-precision timers
0 = do not use them
1 = use these timers (default)
this is usually only useful for Mac machines
dbg=0/1 to activate debug mode
0 = no debug mode (default)
1 = yes debug mode
this is only useful for developers
cufft=0/1 to determine usage of the CUDA FFT library
0 = no CUFFT support
1 = use CUFFT (default)
in the future other CUDA-enabled FFT libraries might be supported
This will write settings to the Makefile.defaults file.
Then type "make" with no arguments to build the library with the
new settings.
After the second make, two files should exist in this directory:
liblammpscuda.a the library LAMMPS will link against
Makefile.lammps settings the LAMMPS Makefile will import
Makefile.lammps is created by the make command and will have settings
consistent with the OPTIONS you selected. It is used by the LAMMPS
build, both for compile-time and link-time settings.

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Per-atom property masks for the "angle" atom style.  Each wrapper below
// instantiates a generic Cuda_AtomVecCuda_* template with the mask naming the
// properties that travel with an atom for that communication step.
const unsigned int ANGLE_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK;
// Exchange (atom migration between ranks): full state except forces.
const unsigned int ANGLE_EXCHANGE_MASK = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | MOLECULE_MASK;
// Border (ghost atoms): identity + position; the _VEL variant adds velocities.
const unsigned int ANGLE_BORDER_MASK = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;
const unsigned int ANGLE_BORDER_VEL_MASK = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | MOLECULE_MASK;

#include "atom_vec_angle_cuda_cu.h"

// Publish device symbols / box geometry for this atom style.
void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata)
{
  return Cuda_AtomVecCuda_Init<ANGLE_DATA_MASK>(sdata);
}

int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
{
  return Cuda_AtomVecCuda_PackExchangeList<ANGLE_EXCHANGE_MASK>(sdata, n, dim, buf_send);
}

int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  return Cuda_AtomVecCuda_PackExchange<ANGLE_EXCHANGE_MASK>(sdata, nsend, buf_send, copylist);
}

int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  return Cuda_AtomVecCuda_UnpackExchange<ANGLE_EXCHANGE_MASK>(sdata, nsend, buf_send, copylist);
}

int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder<ANGLE_BORDER_MASK>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}

// The "Vel" variants reuse the plain border templates with V_MASK added.
int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder<ANGLE_BORDER_VEL_MASK>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}

int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder_Self<ANGLE_BORDER_MASK>(sdata, n, iswap, first, pbc, pbc_flag);
}

int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder_Self<ANGLE_BORDER_VEL_MASK>(sdata, n, iswap, first, pbc, pbc_flag);
}

int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  return Cuda_AtomVecCuda_UnpackBorder<ANGLE_BORDER_MASK>(sdata, n, first, buf_recv);
}

int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  return Cuda_AtomVecCuda_UnpackBorder<ANGLE_BORDER_VEL_MASK>(sdata, n, first, buf_recv);
}

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_ANGLE_CUDA_CU_H_
#define ATOM_VEC_ANGLE_CUDA_CU_H_
// C-linkage entry points for the "angle" atom style, implemented in
// atom_vec_angle_cuda.cu as wrappers around the generic Cuda_AtomVecCuda_*
// templates.  Exchange* routines move whole atoms between processors,
// Border* routines copy ghost-atom data, and the *_Self variants handle
// communication within the same processor.
extern "C" void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
extern "C" int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
extern "C" int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
#endif /*ATOM_VEC_ANGLE_CUDA_CU_H_*/

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Per-atom property masks for the "atomic" atom style.  Each wrapper below
// instantiates a generic Cuda_AtomVecCuda_* template with the mask naming the
// properties that travel with an atom for that communication step.
const unsigned int ATOMIC_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK;
// Exchange (atom migration between ranks): full state except forces.
const unsigned int ATOMIC_EXCHANGE_MASK = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK;
// Border (ghost atoms): identity + position; the _VEL variant adds velocities.
const unsigned int ATOMIC_BORDER_MASK = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;
const unsigned int ATOMIC_BORDER_VEL_MASK = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK;

#include "atom_vec_atomic_cuda_cu.h"

// Publish device symbols / box geometry for this atom style.
void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata)
{
  return Cuda_AtomVecCuda_Init<ATOMIC_DATA_MASK>(sdata);
}

int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
{
  return Cuda_AtomVecCuda_PackExchangeList<ATOMIC_EXCHANGE_MASK>(sdata, n, dim, buf_send);
}

int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  return Cuda_AtomVecCuda_PackExchange<ATOMIC_EXCHANGE_MASK>(sdata, nsend, buf_send, copylist);
}

int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  return Cuda_AtomVecCuda_UnpackExchange<ATOMIC_EXCHANGE_MASK>(sdata, nsend, buf_send, copylist);
}

int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder<ATOMIC_BORDER_MASK>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}

// The "Vel" variants reuse the plain border templates with V_MASK added.
int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder<ATOMIC_BORDER_VEL_MASK>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}

int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder_Self<ATOMIC_BORDER_MASK>(sdata, n, iswap, first, pbc, pbc_flag);
}

int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder_Self<ATOMIC_BORDER_VEL_MASK>(sdata, n, iswap, first, pbc, pbc_flag);
}

int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  return Cuda_AtomVecCuda_UnpackBorder<ATOMIC_BORDER_MASK>(sdata, n, first, buf_recv);
}

int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  return Cuda_AtomVecCuda_UnpackBorder<ATOMIC_BORDER_VEL_MASK>(sdata, n, first, buf_recv);
}

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_ATOMIC_CUDA_CU_H_
#define ATOM_VEC_ATOMIC_CUDA_CU_H_
// C-linkage entry points for the "atomic" atom style, implemented in
// atom_vec_atomic_cuda.cu as wrappers around the generic Cuda_AtomVecCuda_*
// templates.  Exchange* routines move whole atoms between processors,
// Border* routines copy ghost-atom data, and the *_Self variants handle
// communication within the same processor.
extern "C" void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
extern "C" int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
#endif /*ATOM_VEC_ATOMIC_CUDA_CU_H_*/

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Per-atom property masks for the "charge" atom style.  Each wrapper below
// instantiates a generic Cuda_AtomVecCuda_* template with the mask naming the
// properties that travel with an atom for that communication step.
const unsigned int CHARGE_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK;
// Exchange (atom migration between ranks): full state except forces.
const unsigned int CHARGE_EXCHANGE_MASK = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK;
// Border (ghost atoms): identity + position + charge; _VEL adds velocities.
const unsigned int CHARGE_BORDER_MASK = X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;
const unsigned int CHARGE_BORDER_VEL_MASK = X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK;

#include "atom_vec_charge_cuda_cu.h"

// Publish device symbols / box geometry for this atom style.
void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata)
{
  return Cuda_AtomVecCuda_Init<CHARGE_DATA_MASK>(sdata);
}

int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
{
  return Cuda_AtomVecCuda_PackExchangeList<CHARGE_EXCHANGE_MASK>(sdata, n, dim, buf_send);
}

int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  return Cuda_AtomVecCuda_PackExchange<CHARGE_EXCHANGE_MASK>(sdata, nsend, buf_send, copylist);
}

int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  return Cuda_AtomVecCuda_UnpackExchange<CHARGE_EXCHANGE_MASK>(sdata, nsend, buf_send, copylist);
}

int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder<CHARGE_BORDER_MASK>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}

// The "Vel" variants reuse the plain border templates with V_MASK added.
int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder<CHARGE_BORDER_VEL_MASK>(sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}

int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder_Self<CHARGE_BORDER_MASK>(sdata, n, iswap, first, pbc, pbc_flag);
}

int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  return Cuda_AtomVecCuda_PackBorder_Self<CHARGE_BORDER_VEL_MASK>(sdata, n, iswap, first, pbc, pbc_flag);
}

int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  return Cuda_AtomVecCuda_UnpackBorder<CHARGE_BORDER_MASK>(sdata, n, first, buf_recv);
}

int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  return Cuda_AtomVecCuda_UnpackBorder<CHARGE_BORDER_VEL_MASK>(sdata, n, first, buf_recv);
}

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_CHARGE_CUDA_CU_H_
#define ATOM_VEC_CHARGE_CUDA_CU_H_
// C-linkage entry points for the "charge" atom style, implemented in
// atom_vec_charge_cuda.cu as wrappers around the generic Cuda_AtomVecCuda_*
// templates.  Exchange* routines move whole atoms between processors,
// Border* routines copy ghost-atom data, and the *_Self variants handle
// communication within the same processor.
extern "C" void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
extern "C" int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
extern "C" int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
#endif /*ATOM_VEC_CHARGE_CUDA_CU_H_*/

View File

@ -1,628 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX atom_vec_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "cuda_wrapper_cu.h"
#include "crm_cuda_utils.cu"
#include "atom_vec_cuda_kernel.cu"
// Count the number of per-atom scalar values selected by data_mask; used to
// size communication buffers (each item occupies one buffer slot).
int AtomVecCuda_CountDataItems(unsigned int data_mask)
{
  int n = 0;
  if(data_mask & X_MASK) n += 3;       // position vector
  if(data_mask & V_MASK) n += 3;       // velocity vector
  if(data_mask & F_MASK) n += 3;       // force vector
  if(data_mask & TAG_MASK) n++;        // global atom id
  if(data_mask & TYPE_MASK) n++;       // atom type
  if(data_mask & MASK_MASK) n++;       // group mask
  if(data_mask & IMAGE_MASK) n++;      // periodic image flags
  if(data_mask & Q_MASK) n++;          // charge
  if(data_mask & MOLECULE_MASK) n++;   // molecule id
  if(data_mask & RMASS_MASK) n++;      // per-atom mass
  if(data_mask & RADIUS_MASK) n++;     // particle radius
  if(data_mask & DENSITY_MASK) n++;    // particle density
  if(data_mask & OMEGA_MASK) n += 3;   // angular velocity vector
  // NOTE(review): torque only counts as one item here, while omega counts as
  // three -- verify against the pack/unpack kernels whether this should be 3.
  if(data_mask & TORQUE_MASK) n++;
  //if(data_mask & NSPECIAL_MASK) n+=3;
  return n;
}
// Ensure the shared device scratch buffer holds at least 'size' bytes, then
// publish the (possibly reallocated) buffer pointer to this compilation
// unit's device symbol.
void Cuda_AtomVecCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
{
  if(sdata->buffersize < size) {
    // NOTE(review): message says "kB" but buffersize appears to be in bytes.
    MYDBG(printf("Cuda_AtomVecCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;   // signal other modules that the buffer pointer changed
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }
  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
// Re-publish per-atom array base pointers and counts to this module's device
// symbols.  Called whenever the atom arrays may have been reallocated (nmax
// grew).  The core arrays (x, v, f, tag, type, mask, image) are always
// copied; optional arrays only when selected by the compile-time data_mask.
// NOTE(review): several sizeof(...) third arguments below name a pointer type
// that may not match the array's element type (e.g. sizeof(int*) for radius/
// density/rmass/omega) -- harmless since only a pointer value is copied and
// all object pointers have the same size, but worth confirming.
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal  , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)    , & sdata->atom.nmax    , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(x)       , & sdata->atom.x    .dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(v)       , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f)       , & sdata->atom.f    .dev_data, sizeof(F_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(tag)     , & sdata->atom.tag  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(type)    , & sdata->atom.type .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(mask)    , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(image)   , & sdata->atom.image.dev_data, sizeof(int*));
  if(data_mask & Q_MASK) cudaMemcpyToSymbol(MY_AP(q)     , & sdata->atom.q    .dev_data, sizeof(F_CFLOAT*));
  if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbol(MY_AP(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*));
  if(data_mask & RADIUS_MASK) cudaMemcpyToSymbol(MY_AP(radius) , & sdata->atom.radius.dev_data, sizeof(int*));
  if(data_mask & DENSITY_MASK) cudaMemcpyToSymbol(MY_AP(density) , & sdata->atom.density.dev_data, sizeof(int*));
  if(data_mask & RMASS_MASK) cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(int*));
  if(data_mask & OMEGA_MASK) cudaMemcpyToSymbol(MY_AP(omega) , & sdata->atom.omega.dev_data, sizeof(int*));
  //if(data_mask & NSPECIAL_MASK) cudaMemcpyToSymbol(MY_AP(nspecial) , & sdata->atom.nspecial.dev_data, sizeof(int*) );
  cudaMemcpyToSymbol(MY_AP(flag)    , & sdata->flag, sizeof(int*));
}
// Initialize / refresh this module's device symbols: per-atom array pointers
// (only when reallocated), nlocal, box dimensions and sub-domain bounds, and
// the device-side error-flag pointer.  Synchronizes before returning.
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata)
{
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... start\n");)
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal  , sizeof(int));
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n");)
  // prd is passed directly (array decays to pointer); sublo/subhi by address
  // -- NOTE(review): assumes domain.sublo/subhi are X_CFLOAT[3]; confirm.
  cudaMemcpyToSymbol(MY_AP(prd)   , sdata->domain.prd, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(sublo) , & sdata->domain.sublo, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(subhi) , & sdata->domain.subhi, 3 * sizeof(X_CFLOAT));
  cudaMemcpyToSymbol(MY_AP(flag)  , & sdata->flag, sizeof(int*));
  cudaThreadSynchronize();
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_Init ... end\n");)
}
// Forward communication: pack the data selected by data_mask for the n atoms
// listed in sendlist[iswap] into a send buffer, applying the periodic shift
// (dx,dy,dz).  With overlap_comm the kernel writes straight into the per-swap
// device send buffer; otherwise it fills the generic device buffer, which is
// then downloaded into host buf_send.  Returns the number of buffer items
// produced (n * items per atom).
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  my_times time1, time2;
  // Refresh device symbols if atom arrays were reallocated / nlocal changed.
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal  , sizeof(int));
  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = (n * n_data_items) * sizeof(X_CFLOAT);
  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);
  // Periodic image shift applied to positions crossing the boundary.
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;
  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // Triclinic box: include tilt-factor (xy, xz, yz) contributions.
      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
      dz = pbc[2] * sdata->domain.prd[2];
    }
  }
  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  if(sdata->atom.nlocal > 0) {
    cudaMemset(sdata->flag, 0, sizeof(int));   // clear device-side error flag
    my_gettime(CLOCK_REALTIME, &time1);
    // Destination: per-swap device buffer when comm overlap is on, else the
    // generic device buffer (downloaded to host below).
    void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
    Cuda_AtomVecCuda_PackComm_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
        , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_kernel_pack +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
    if(not sdata->overlap_comm)
      cudaMemcpy(buf_send, sdata->buffer, n* n_data_items* sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
    //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_forward_download +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
    // Check whether the kernel reported a problem via the device flag.
    int aflag;
    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
    if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
  }
  return n_data_items * n;
}
// Forward communication within the same processor: copy the data selected by
// data_mask for the n atoms listed in sendlist[iswap] directly into the
// ghost-atom slots starting at index 'first', applying the periodic shift
// (dx,dy,dz) on the device -- no host buffer is involved.  Returns the number
// of data items processed (n * items per atom).
// (Removed a leftover "static int count" debug counter that was incremented
// but never read.)
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");)
  my_times time1, time2;

  // Refresh device symbols if atom arrays were reallocated / nlocal changed.
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal  , sizeof(int));

  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = (n * n_data_items) * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  // Periodic image shift applied to positions crossing the boundary.
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // Triclinic box: include tilt-factor (xy, xz, yz) contributions.
      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
      dz = pbc[2] * sdata->domain.prd[2];
    }
  }

  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_gettime(CLOCK_REALTIME, &time1);
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed");
    Cuda_AtomVecCuda_PackComm_Self_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_kernel_self +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self: Kernel execution failed");
  }

  return n_data_items * n;
}
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
{
  // Forward-communication unpack for n ghost atoms starting at index 'first'.
  // Unless overlapping communication is active (and iswap >= 0), the host
  // receive buffer is first uploaded into sdata->buffer; the kernel then
  // scatters the masked per-atom fields out of the device buffer.
  my_times time1, time2;

  // Refresh device-side nmax/nlocal copies if the host values changed.
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  // Grow the shared device buffer if this transfer would not fit.
  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = (n * n_data_items) * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_gettime(CLOCK_REALTIME, &time1);

    // With overlapping comm the data already sits in a per-swap device
    // buffer, so the host-to-device upload is skipped.
    if(not sdata->overlap_comm || iswap < 0)
      cudaMemcpy(sdata->buffer, (void*)buf_recv, n_data_items * n * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_upload +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    // Select the device-side source: per-swap receive buffer when
    // overlapping comm is on, shared staging buffer otherwise.
    void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
    Cuda_AtomVecCuda_UnpackComm_Kernel<data_mask> <<< grid, threads, 0>>>(n, first, buf);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_forward_kernel_unpack +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackComm: Kernel execution failed");
  }
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
{
  // Build the list of local atoms that left this sub-domain along dimension
  // 'dim'.  The kernel stores the count (as int) in the first buffer slot and
  // the leaving atoms' indices (as doubles) after it; at most n-1 indices are
  // recorded.  Returns the number of leaving atoms found.
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... start dim %i \n", dim);)
  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: pre Kernel execution failed");
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
  Cuda_AtomVecCuda_Init<data_mask>(sdata);

  int size = n * sizeof(double);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  // Slot 0 doubles as the atomic counter of leaving atoms; clear it.
  cudaMemset((int*)(sdata->buffer), 0, sizeof(int));

  int3 layout = getgrid(sdata->atom.nlocal, sizeof(int), 256, true);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  my_times time1, time2;
  my_gettime(CLOCK_REALTIME, &time1);

  // Shared memory: one int per thread plus one slot for the block's offset.
  Cuda_AtomVecCuda_PackExchangeList_Kernel <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (n - 1, dim);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed");

  my_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.comm_exchange_kernel_pack +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

  // Fetch the counter first (int stored in the first double-sized slot),
  // then -- if anything left -- the index list itself.
  cudaMemcpy(buf_send, sdata->buffer, sizeof(double), cudaMemcpyDeviceToHost);
  int return_value = ((int*) buf_send)[0];

  if(n > 1 + return_value)
    cudaMemcpy(buf_send, sdata->buffer, (1 + return_value)*sizeof(double), cudaMemcpyDeviceToHost);

  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: return copy failed");

  my_gettime(CLOCK_REALTIME, &time1);
  sdata->cuda_timings.comm_exchange_download +=
    time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... done\n");)
  return return_value;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  // Pack the data of the nsend leaving atoms into buf_send and let the kernel
  // compact the local arrays by copying atoms from copylist into the holes.
  // Buffer layout (doubles): item m of entry k at buf[1 + k + m*nsend].
  // NOTE(review): the kernel reads each entry's atom index from the buffer
  // slot m == 0, presumably left there by the preceding PackExchangeList
  // call -- confirm the buffer is not reallocated in between.
  // Returns the number of doubles packed.
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... start \n");)

  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  //if(sdata->atom.update_nlocal)
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  // +1: each atom carries its own index in addition to the masked fields.
  int n_data_items = AtomVecCuda_CountDataItems(data_mask) + 1;
  int size = (nsend * n_data_items + 1) * sizeof(double);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  cudaMemset((int*)(sdata->buffer), 0, sizeof(int));

  int3 layout = getgrid(nsend, 0);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  my_times time1, time2;
  my_gettime(CLOCK_REALTIME, &time1);

  Cuda_AtomVecCuda_PackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(nsend, (int*) copylist);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchange: Kernel execution failed");

  my_gettime(CLOCK_REALTIME, &time2);
  sdata->cuda_timings.comm_exchange_kernel_pack +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

  // Download the whole packed block to the host send buffer.
  cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);

  my_gettime(CLOCK_REALTIME, &time1);
  sdata->cuda_timings.comm_exchange_download +=
    time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
  MYDBG(printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... done\n");)
  return nsend * n_data_items + 1;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  // Unpack atoms received during an exchange: upload buf_send, then let the
  // kernel append the atoms that fall inside this sub-domain and record each
  // entry's destination index in copylist (-1 when rejected).  Returns the
  // number of accepted atoms, read back from the device flag counter.
  Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  // +1: each packed atom carries its own index in addition to its fields.
  int n_data_items = AtomVecCuda_CountDataItems(data_mask) + 1;
  int size = (nsend * n_data_items + 1) * sizeof(double);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  // _flag serves as the atomic counter of accepted atoms; publish and clear.
  cudaMemcpyToSymbol(MY_AP(flag) , & sdata->flag, sizeof(int*));
  cudaMemset((int*)(sdata->flag), 0, sizeof(int));

  if(nsend) {
    int3 layout = getgrid(nsend, 0);
    dim3 threads(layout.z, 1, 1);
    dim3 grid(layout.x, layout.y, 1);

    if(sdata->atom.nlocal > 0) {
      my_times time1, time2;
      my_gettime(CLOCK_REALTIME, &time1);

      cudaMemcpy(sdata->buffer, buf_send , size, cudaMemcpyHostToDevice);

      my_gettime(CLOCK_REALTIME, &time2);
      sdata->cuda_timings.comm_exchange_upload +=
        time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

      Cuda_AtomVecCuda_UnpackExchange_Kernel<data_mask> <<< grid, threads, 0>>>(sdata->exchange_dim, nsend, (int*) copylist);
      cudaThreadSynchronize();

      my_gettime(CLOCK_REALTIME, &time1);
      sdata->cuda_timings.comm_exchange_kernel_unpack +=
        time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
      CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackExchange: Kernel execution failed");
    }
  }

  // Number of atoms the kernel actually accepted (0 if nothing ran).
  int naccept;
  cudaMemcpy((void*)&naccept, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
  return naccept;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  // Pack border-communication data for the nsend send-list atoms of swap
  // 'iswap' on the device and download the result into buf_send.
  // Returns the number of data items packed.
  my_times atime1, atime2;
  my_gettime(CLOCK_REALTIME, &atime1);

  // Refresh device-side nmax/nlocal copies if the host values changed.
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  my_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = nsend * n_data_items * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  // Periodic displacement for positions.  NOTE(review): in the triclinic
  // branch pbc[] is used directly as the displacement (no prd/tilt factors)
  // -- presumably the caller supplies actual distances for triclinic border
  // comm; confirm against the calling comm code.
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      dx = pbc[0];
      dy = pbc[1];
      dz = pbc[2];
    }
  }

  int3 layout = getgrid(nsend);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_times time1, time2;
    my_gettime(CLOCK_REALTIME, &time1);

    Cuda_AtomVecCuda_PackBorder_Kernel<data_mask> <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, nsend, sdata->comm.maxlistlength, iswap, dx, dy, dz);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_border_kernel_pack +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    // Download the packed block to the host send buffer.
    cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed");

    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_border_download +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
  }

  return nsend * n_data_items;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  // Border pack for a swap with self: copy the masked per-atom fields of the
  // n send-list atoms of swap 'iswap' straight into the ghost slots starting
  // at 'first', shifting positions by the periodic displacement if requested.
  // Returns the number of data items handled.
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  // Make sure the shared device buffer would be large enough.
  const int items_per_atom = AtomVecCuda_CountDataItems(data_mask);
  const int required_bytes = n * items_per_atom * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (required_bytes > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, required_bytes);

  // Periodic shift applied to positions; stays zero when pbc_flag == 0.
  X_CFLOAT shift_x = 0.0, shift_y = 0.0, shift_z = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      shift_x = pbc[0] * sdata->domain.prd[0];
      shift_y = pbc[1] * sdata->domain.prd[1];
      shift_z = pbc[2] * sdata->domain.prd[2];
    } else {
      shift_x = pbc[0];
      shift_y = pbc[1];
      shift_z = pbc[2];
    }
  }

  const int3 layout = getgrid(n);
  const dim3 block_dim(layout.z, 1, 1);
  const dim3 grid_dim(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_times t_begin, t_end;
    my_gettime(CLOCK_REALTIME, &t_begin);

    Cuda_AtomVecCuda_PackBorder_Self_Kernel<data_mask> <<< grid_dim, block_dim, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, shift_x, shift_y, shift_z, first);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &t_end);
    sdata->cuda_timings.comm_border_kernel_self +=
      t_end.tv_sec - t_begin.tv_sec + 1.0 * (t_end.tv_nsec - t_begin.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder_Self: Kernel execution failed");
  }

  return n * items_per_atom;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  // Upload received border data and scatter it into ghost slots starting at
  // 'first'.  Returns the device-set grow flag (non-zero when a destination
  // slot would exceed nmax, i.e. the host must enlarge the per-atom arrays).
  my_times atime1, atime2;
  my_gettime(CLOCK_REALTIME, &atime1);

  // Refresh device-side nmax/nlocal copies if the host values changed.
  if(sdata->atom.update_nmax)
    Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  my_gettime(CLOCK_REALTIME, &atime2);
  sdata->cuda_timings.test1 +=
    atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;

  int n_data_items = AtomVecCuda_CountDataItems(data_mask);
  int size = n * n_data_items * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_AtomVecCuda_UpdateBuffer(sdata, size);

  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_times time1, time2;
    my_gettime(CLOCK_REALTIME, &time1);

    // Clear the overflow flag, then upload the packed host data.
    cudaMemset((int*)(sdata->flag), 0, sizeof(int));
    cudaMemcpy(sdata->buffer, (void*)buf_recv, size, cudaMemcpyHostToDevice);

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_border_upload +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    Cuda_AtomVecCuda_UnpackBorder_Kernel<data_mask> <<< grid, threads, 0>>>(n, first);
    cudaThreadSynchronize();

    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_border_kernel_unpack +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

    // Read back the device flag set by the kernel on slot overflow.
    cudaMemcpy(&sdata->comm.grow_flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
    CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackBorder: Kernel execution failed");
  }

  return sdata->comm.grow_flag;
}
#include "atom_vec_angle_cuda.cu"
#include "atom_vec_atomic_cuda.cu"
#include "atom_vec_charge_cuda.cu"
#include "atom_vec_full_cuda.cu"
//#include "atom_vec_granular_cuda.cu"

View File

@ -1,512 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#define RIMLARGER 1.000001
#define RIMSMALLER 0.999999
#define SMALL 1e-5
extern __shared__ int shared[];
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
{
  // Forward-comm pack: thread i gathers the masked fields of send-list atom
  // j = list[i] into 'buffer', field-major (item k of atom i is stored at
  // buffer[i + k*n]).  Positions are shifted by the periodic displacement.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];

    // Flag an out-of-range send-list entry for the host to report.
    if(j > _nmax) _flag[0] = 1;

    int k = 0;

    if(data_mask & X_MASK) {
      ((X_CFLOAT*) buffer)[i + k * n] = _x[j] + dx;
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _x[j + _nmax] + dy;
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _x[j + 2 * _nmax] + dz;
      k++;
    }

    if(data_mask & V_MASK) {
      ((X_CFLOAT*) buffer)[i + k * n] = _v[j];
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _v[j + _nmax];
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _v[j + 2 * _nmax];
      k++;
    }

    if(data_mask & OMEGA_MASK) {
      ((X_CFLOAT*) buffer)[i + k * n] = _omega[j];
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _omega[j + _nmax];
      k++;
      ((X_CFLOAT*) buffer)[i + k * n] = _omega[j + 2 * _nmax];
      k++;
    }

    // NOTE(review): the two k++ below execute even when the corresponding
    // mask bit is unset; the unpack kernel increments identically, so the
    // two layouts stay in sync either way.
    if(data_mask & RADIUS_MASK)((X_CFLOAT*) buffer)[i + k * n] = _radius[j];

    k++;

    if(data_mask & RMASS_MASK)((X_CFLOAT*) buffer)[i + k * n] = _rmass[j];

    k++;
  }
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
  // Forward-comm copy for a self swap: thread i copies the masked fields of
  // send-list atom list[i] directly into ghost slot i+first (no staging
  // buffer).  Positions get the periodic shift (dx,dy,dz).
  //
  // Fix: removed the dead store `int j = i;` that was immediately
  // overwritten by `j = list[i];`.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];  // source atom index from the swap's send list

    if(data_mask & X_MASK) {
      _x[i + first] = _x[j] + dx;
      _x[i + first + _nmax] = _x[j + _nmax] + dy;
      _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
    }

    if(data_mask & V_MASK) {
      _v[i + first] = _v[j];
      _v[i + first + _nmax] = _v[j + _nmax];
      _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
    }

    if(data_mask & OMEGA_MASK) {
      _omega[i + first] = _omega[j];
      _omega[i + first + _nmax] = _omega[j + _nmax];
      _omega[i + first + 2 * _nmax] = _omega[j + 2 * _nmax];
    }

    if(data_mask & RADIUS_MASK) _radius[i + first] = _radius[j];

    if(data_mask & RMASS_MASK) _rmass[i + first] = _rmass[j];
  }
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n, int first, void* buffer)
{
  // Forward-comm unpack: thread i scatters the masked fields of ghost atom
  // i+first out of 'buffer' (field-major: item k of atom i at buffer[i+k*n]).
  // The k increments mirror Cuda_AtomVecCuda_PackComm_Kernel exactly.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < n) {
    int k = 0;

    if(data_mask & X_MASK) {
      _x[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _x[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _x[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
    }

    if(data_mask & V_MASK) {
      _v[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _v[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _v[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
    }

    if(data_mask & OMEGA_MASK) {
      _omega[i + first] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _omega[i + first + _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
      _omega[i + first + 2 * _nmax] = ((X_CFLOAT*) buffer)[i + k * n];
      k++;
    }

    // NOTE(review): as in the pack kernel, these k++ run unconditionally so
    // the two buffer layouts stay in sync even when a mask bit is unset.
    if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_CFLOAT*) buffer)[i + k * n];

    k++;

    if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_CFLOAT*) buffer)[i + k * n];

    k++;
  }
}
__global__ void Cuda_AtomVecCuda_PackExchangeList_Kernel(int n, int dim)
{
  // Identify local atoms whose coordinate along 'dim' lies outside this
  // sub-domain.  _buffer[0] (read as int) accumulates the global count via
  // atomicAdd; the leaving atoms' indices are stored (as doubles) from
  // slot 1 on.  At most n indices are written.
  double* buf = (double*) _buffer;
  buf = &buf[1];
  //X_CFLOAT lo=slablo[iswap];
  //X_CFLOAT hi=slabhi[iswap];
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  bool add = false;

  if(i < _nlocal) {
    double xdim_tmp = static_cast <double>(_x[i + dim * _nmax]);

    // Atom leaves when it is below sublo or at/above subhi along 'dim'.
    if(xdim_tmp < _sublo[dim] || xdim_tmp >= _subhi[dim]) {
      add = true;
    }
  }

  // Block-local prefix count in shared memory: thread 0 serially assigns
  // each flagged thread its 1-based rank within the block, then reserves a
  // contiguous range in the global counter; shared[blockDim.x] holds the
  // block's base offset returned by atomicAdd.
  shared[threadIdx.x] = add ? 1 : 0;
  __syncthreads();
  int nsend = 0;

  if(threadIdx.x == 0) {
    for(int k = 0; k < blockDim.x; k++) {
      if(shared[k]) {
        nsend++;
        shared[k] = nsend;
      }
    }

    shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
  }

  __syncthreads();
  // Global (0-based) output slot for this thread's atom.
  nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;

  // Only write while the slot still fits into the n-entry index area.
  if(add && nsend + 1 < n)
    buf[nsend] = i;
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackExchange_Kernel(int nsend, int* copylist)
{
  // Thread k packs leaving atom i (whose index is pre-stored in this entry's
  // slot 0) into the exchange buffer, then fills atom i's place in the local
  // arrays with atom j = copylist[k].  Layout: item m of entry k lives at
  // ((double*)_buffer)[1 + k + m*nsend]; m == 0 is the atom index itself.
  double* buf = (double*) _buffer;
  int k = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(k >= nsend) return;

  buf = &buf[1 + k];
  int i = static_cast <int>(buf[0]);  // leaving atom's index (pre-stored)
  int j = copylist[k];                // atom moved in to fill the hole
  int m = 1;

  if(data_mask & X_MASK) {
    buf[(m++)*nsend] = static_cast <double>(_x[i]);
    buf[(m++)*nsend] = static_cast <double>(_x[i + _nmax]);
    buf[(m++)*nsend] = static_cast <double>(_x[i + 2 * _nmax]);
  }

  if(data_mask & V_MASK) {
    buf[(m++)*nsend] = _v[i];
    buf[(m++)*nsend] = _v[i + _nmax];
    buf[(m++)*nsend] = _v[i + 2 * _nmax];
  }

  if(data_mask & TAG_MASK) buf[(m++)*nsend] = _tag[i];

  if(data_mask & TYPE_MASK) buf[(m++)*nsend] = _type[i];

  if(data_mask & MASK_MASK) buf[(m++)*nsend] = _mask[i];

  if(data_mask & IMAGE_MASK) buf[(m++)*nsend] = _image[i];

  if(data_mask & Q_MASK) buf[(m++)*nsend] = _q[i];

  if(data_mask & MOLECULE_MASK) buf[(m++)*nsend] = _molecule[i];

  if(data_mask & RADIUS_MASK) buf[(m++)*nsend] = _radius[i];

  if(data_mask & DENSITY_MASK) buf[(m++)*nsend] = _density[i];

  if(data_mask & RMASS_MASK) buf[(m++)*nsend] = _rmass[i];

  if(data_mask & OMEGA_MASK) {
    buf[(m++)*nsend] = _omega[i];
    buf[(m++)*nsend] = _omega[i + _nmax];
    buf[(m++)*nsend] = _omega[i + 2 * _nmax];
  }

  /*  if(data_mask & NSPECIAL_MASK)
    {
      buf[(m++)*nsend] = _nspecial[i];
      buf[(m++)*nsend] = _nspecial[i+_nmax];
      buf[(m++)*nsend] = _nspecial[i+2* _nmax];
    }*/

  // Holes at or beyond nlocal need no compaction copy.
  if(i >= _nlocal) return;

  if(data_mask & X_MASK) {
    _x[i] = _x[j];
    _x[i + _nmax] = _x[j + _nmax];
    _x[i + 2 * _nmax] = _x[j + 2 * _nmax];
  }

  if(data_mask & V_MASK) {
    _v[i] = _v[j];
    _v[i + _nmax] = _v[j + _nmax];
    _v[i + 2 * _nmax] = _v[j + 2 * _nmax];
  }

  if(data_mask & TAG_MASK) _tag[i] = _tag[j];

  if(data_mask & TYPE_MASK) _type[i] = _type[j];

  if(data_mask & MASK_MASK) _mask[i] = _mask[j];

  if(data_mask & IMAGE_MASK) _image[i] = _image[j];

  if(data_mask & Q_MASK) _q[i] = _q[j];

  if(data_mask & MOLECULE_MASK) _molecule[i] = _molecule[j];

  if(data_mask & RADIUS_MASK) _radius[i] = _radius[j];

  if(data_mask & DENSITY_MASK) _density[i] = _density[j];

  if(data_mask & RMASS_MASK) _rmass[i] = _rmass[j];

  if(data_mask & OMEGA_MASK) {
    _omega[i] = _omega[j];
    _omega[i + _nmax] = _omega[j + _nmax];
    _omega[i + 2 * _nmax] = _omega[j + 2 * _nmax];
  }

  /*  if(data_mask & NSPECIAL_MASK)
    {
      _nspecial[i] = _nspecial[j];
      _nspecial[i+_nmax] = _nspecial[j+_nmax];
      _nspecial[i+2* _nmax] = _nspecial[j+2* _nmax];
    }*/
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim, int nsend, int* copylist)
{
  // Thread k inspects received atom k (layout: item m of entry k at
  // ((double*)_buffer)[1 + k + m*nsend]).  If its coordinate along 'dim'
  // lies inside this sub-domain (with SMALL tolerance), it atomically claims
  // a new local slot via _flag and unpacks all masked fields there.
  // copylist[k] records the destination index, or -1 if rejected.
  double* buf = (double*) _buffer;
  int k = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(k >= nsend) return;

  buf = &buf[1 + k];
  int i = -1;  // destination index; stays -1 when the atom is rejected
  // Item 1 holds the first position component; 1+dim selects the split axis.
  double xdim_tmp = buf[(1 + dim) * nsend];

  if(xdim_tmp >= _sublo[dim] - SMALL && xdim_tmp < _subhi[dim] + SMALL) {
    // _flag doubles as the atomic counter of accepted atoms.
    i = atomicAdd(_flag, 1) + _nlocal;

    int m = 1;

    if(data_mask & X_MASK) {
      _x[i] = buf[(m++) * nsend];
      _x[i + _nmax] = buf[(m++) * nsend];
      _x[i + 2 * _nmax] = buf[(m++) * nsend];
    }

    if(data_mask & V_MASK) {
      _v[i] = buf[(m++) * nsend];
      _v[i + _nmax] = buf[(m++) * nsend];
      _v[i + 2 * _nmax] = buf[(m++) * nsend];
    }

    if(data_mask & TAG_MASK) _tag[i] = buf[(m++) * nsend];

    if(data_mask & TYPE_MASK) _type[i] = buf[(m++) * nsend];

    if(data_mask & MASK_MASK) _mask[i] = buf[(m++) * nsend];

    if(data_mask & IMAGE_MASK) _image[i] = buf[(m++) * nsend];

    if(data_mask & Q_MASK) _q[i] = buf[(m++) * nsend];

    if(data_mask & MOLECULE_MASK) _molecule[i] = buf[(m++) * nsend];

    if(data_mask & RADIUS_MASK) _radius[i] = buf[(m++) * nsend];

    if(data_mask & DENSITY_MASK) _density[i] = buf[(m++) * nsend];

    if(data_mask & RMASS_MASK) _rmass[i] = buf[(m++) * nsend];

    if(data_mask & OMEGA_MASK) {
      _omega[i] = buf[(m++) * nsend];
      _omega[i + _nmax] = buf[(m++) * nsend];
      _omega[i + 2 * _nmax] = buf[(m++) * nsend];
    }

    /*  if(data_mask & NSPECIAL_MASK)
      {
        _nspecial[i] = buf[(m++)*nsend];
        _nspecial[i+_nmax] = buf[(m++)*nsend];
        _nspecial[i+2*_nmax] = buf[(m++)*nsend];
      }*/
  }

  copylist[k] = i;
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz)
{
  // Border pack: thread i gathers the masked fields of send-list atom
  // j = list[i] into the shared device buffer, field-major (item m of atom i
  // at _buffer[i + m*n]).  Positions are shifted by the periodic
  // displacement (dx,dy,dz).
  //
  // BUGFIX: radius, density, rmass and omega were previously read with index
  // i (the slot in the send list) instead of j (the actual atom) -- unlike
  // every other field in this kernel and unlike the matching
  // Cuda_AtomVecCuda_PackBorder_Self_Kernel, which reads them via j.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];
    int m = 0;

    if(data_mask & X_MASK) {
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j] + dx;
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j + _nmax] + dy;
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _x[j + 2 * _nmax] + dz;
    }

    if(data_mask & V_MASK) {
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j];
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j + _nmax];
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _v[j + 2 * _nmax];
    }

    if(data_mask & TAG_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _tag[j];

    if(data_mask & TYPE_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _type[j];

    if(data_mask & MASK_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _mask[j];

    if(data_mask & Q_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _q[j];

    if(data_mask & MOLECULE_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _molecule[j];

    if(data_mask & RADIUS_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _radius[j];

    if(data_mask & DENSITY_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _density[j];

    if(data_mask & RMASS_MASK)((X_CFLOAT*) _buffer)[i + (m++)*n] = _rmass[j];

    if(data_mask & OMEGA_MASK) {
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[j];
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[j + _nmax];
      ((X_CFLOAT*) _buffer)[i + (m++)*n] = _omega[j + 2 * _nmax];
    }
  }
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
  // Border copy for a self swap: thread i copies the masked fields of
  // send-list atom j = list[i] directly into ghost slot i+first (no staging
  // buffer).  Positions are shifted by the periodic displacement (dx,dy,dz).
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];

    if(data_mask & X_MASK) {
      _x[i + first] = _x[j] + dx;
      _x[i + first + _nmax] = _x[j + _nmax] + dy;
      _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
    }

    if(data_mask & V_MASK) {
      _v[i + first] = _v[j];
      _v[i + first + _nmax] = _v[j + _nmax];
      _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
    }

    if(data_mask & TAG_MASK) _tag[i + first] = _tag[j];

    if(data_mask & TYPE_MASK) _type[i + first] = _type[j];

    if(data_mask & MASK_MASK) _mask[i + first] = _mask[j];

    if(data_mask & Q_MASK) _q[i + first] = _q[j];

    if(data_mask & MOLECULE_MASK) _molecule[i + first] = _molecule[j];

    if(data_mask & RADIUS_MASK) _radius[i + first] = _radius[j];

    if(data_mask & DENSITY_MASK) _density[i + first] = _density[j];

    if(data_mask & RMASS_MASK) _rmass[i + first] = _rmass[j];

    if(data_mask & OMEGA_MASK) {
      _omega[i + first] = _omega[j];
      _omega[i + first + _nmax] = _omega[j + _nmax];
      _omega[i + first + 2 * _nmax] = _omega[j + 2 * _nmax];
    }
  }
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n, int first)
{
  // Border unpack: thread i scatters the masked fields of received atom i
  // from the device buffer (field-major: item m at _buffer[i + m*n]) into
  // ghost slot i+first.  If the slot would exceed _nmax, the overflow flag
  // is raised instead so the host can grow the per-atom arrays.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < n) {
    if(i + first < _nmax) {
      int m = 0;

      if(data_mask & X_MASK) {
        _x[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
        _x[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
        _x[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
      }

      if(data_mask & V_MASK) {
        _v[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
        _v[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
        _v[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
      }

      // Integer-valued fields travel as X_CFLOAT and are cast back here.
      if(data_mask & TAG_MASK) _tag[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);

      if(data_mask & TYPE_MASK) _type[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);

      if(data_mask & MASK_MASK) _mask[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);

      if(data_mask & Q_MASK) _q[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];

      if(data_mask & MOLECULE_MASK) _molecule[i + first] = static_cast<int>(((X_CFLOAT*) _buffer)[i + (m++) * n]);

      if(data_mask & RADIUS_MASK) _radius[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];

      if(data_mask & DENSITY_MASK) _density[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];

      if(data_mask & RMASS_MASK) _rmass[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];

      if(data_mask & OMEGA_MASK) {
        _omega[i + first] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
        _omega[i + first + _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
        _omega[i + first + 2 * _nmax] = ((X_CFLOAT*) _buffer)[i + (m++) * n];
      }
    } else {
      // Destination slot out of range: signal the host to grow arrays.
      _flag[0] = 1;
    }
  }
}

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
const unsigned int FULL_DATA_MASK = X_MASK | V_MASK | F_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK;
#include "atom_vec_full_cuda_cu.h"
void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata)
{
  // Initialize device-side state for atom_style "full" with its full mask.
  Cuda_AtomVecCuda_Init<FULL_DATA_MASK>(sdata);
}
int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send)
{
  // Exchange-list scan for atom_style "full": every field this style
  // exchanges (x, v, tag, type, mask, image, q, molecule).
  return Cuda_AtomVecCuda_PackExchangeList < X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK > (sdata, n, dim, buf_send);
}
int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  // Exchange pack for atom_style "full" (same mask as the exchange-list scan).
  return Cuda_AtomVecCuda_PackExchange < X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK > (sdata, nsend, buf_send, copylist);
}
int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist)
{
  // Exchange unpack for atom_style "full" (mask mirrors the pack side).
  return Cuda_AtomVecCuda_UnpackExchange < X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK > (sdata, nsend, buf_send, copylist);
}
int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  // Border pack for atom_style "full": positions plus identity, charge and
  // molecule id -- no velocities.
  return Cuda_AtomVecCuda_PackBorder < X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK > (sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}
int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  // Border pack variant that also communicates velocities.
  return Cuda_AtomVecCuda_PackBorder < X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK > (sdata, nsend, iswap, buf_send, pbc, pbc_flag);
}
int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  // Self-swap border pack for atom_style "full" (no velocities).
  return Cuda_AtomVecCuda_PackBorder_Self < X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK > (sdata, n, iswap, first, pbc, pbc_flag);
}
int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  // Self-swap border pack variant that also copies velocities.
  return Cuda_AtomVecCuda_PackBorder_Self < X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK > (sdata, n, iswap, first, pbc, pbc_flag);
}
int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  // Border unpack for atom_style "full" (mask mirrors PackBorder).
  return Cuda_AtomVecCuda_UnpackBorder < X_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK > (sdata, n, first, buf_recv);
}
int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
  // Border unpack variant that also restores velocities (mirrors PackBorderVel).
  return Cuda_AtomVecCuda_UnpackBorder < X_MASK | V_MASK | TAG_MASK | TYPE_MASK | MASK_MASK | Q_MASK | MOLECULE_MASK > (sdata, n, first, buf_recv);
}

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_FULL_CUDA_CU_H_
#define ATOM_VEC_FULL_CUDA_CU_H_

// C-linkage entry points for the atom_style "full" CUDA communication
// routines (thin wrappers around the data_mask-templated Cuda_AtomVecCuda_*
// functions, implemented in atom_vec_full_cuda.cu).

extern "C" void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata, int n, int dim, void* buf_send);
extern "C" int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata, int nsend, void* buf_send, void* copylist);
extern "C" int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata, int nsend, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata, int n, int first, void* buf_recv);
extern "C" int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata, int n, int first, void* buf_recv);
#endif /*ATOM_VEC_FULL_CUDA_CU_H_*/

View File

@ -1,539 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX comm_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "comm_cuda_cu.h"
#include "comm_cuda_kernel.cu"
#include <ctime>
// Grow the shared device staging buffer so it can hold n atoms * 3 X_CFLOAT
// values, then publish the (possibly new) pointer to this module's
// __constant__ symbol MY_AP(buffer).  Only grows, never shrinks.
void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata, int n)
{
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffersize < size) {
// NOTE(review): debug message says "kB" but buffersize is in bytes.
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
// buffer_new signals other modules that the buffer pointer changed and
// their __constant__ copies must be refreshed.
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
// Refresh this module's __constant__ copies of the per-atom array pointers
// and sizes.  Must be called whenever the atom arrays were reallocated on
// the device (sdata->atom.update_nmax is the caller-side trigger).
void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
}
// One-time initialization of the comm module's device constants:
// atom-array pointers plus type count, box dimensions and the shared
// flag/debug scratch pointers.
void Cuda_CommCuda_Init(cuda_shared_data* sdata)
{
Cuda_CommCuda_UpdateNmax(sdata);
// +1 because LAMMPS type indices are 1-based.
int ntypesp = sdata->atom.ntypes + 1;
cudaMemcpyToSymbol(MY_AP(cuda_ntypes) , &ntypesp, sizeof(int));
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd, 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(debugdata) , &sdata->debugdata, sizeof(int*));
}
// Pack the coordinates of the n atoms in swap "iswap" into a forward-comm
// buffer (3 values per atom, struct-of-arrays layout: n x, then n y, then n z).
// pbc/pbc_flag describe the periodic image shift to apply while packing.
// If overlap_comm is enabled the data stays in a per-swap device buffer;
// otherwise it is downloaded to buf_send on the host.
// Returns the number of values packed (3*n).
int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
my_times time1, time2;
// Refresh device constants if atom arrays were reallocated or nlocal changed.
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
// Periodic shift added to the packed coordinates; zero when pbc_flag == 0.
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
dx = pbc[0] * sdata->domain.prd[0];
dy = pbc[1] * sdata->domain.prd[1];
dz = pbc[2] * sdata->domain.prd[2];
} else {
// Triclinic box: include the tilt factors xy, xz, yz.
dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
dz = pbc[2] * sdata->domain.prd[2];
}
}
// One thread per atom to pack.
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
cudaMemset(sdata->flag, 0, sizeof(int));
my_gettime(CLOCK_REALTIME, &time1);
// With overlapped communication pack straight into the per-swap device
// send buffer; otherwise use the shared staging buffer.
void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
Cuda_CommCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
, sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
cudaThreadSynchronize();
my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_kernel_pack +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
// Only download to the host when comm is not overlapped with computation.
if(not sdata->overlap_comm)
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_download +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
// The kernel sets *flag when a sendlist index exceeded nmax.
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
}
return 3 * n;
}
// Pack coordinates AND velocities of the n atoms in swap "iswap" into a
// forward-comm buffer (6 values per atom, struct-of-arrays layout:
// n x, n y, n z, n vx, n vy, n vz).  pbc/pbc_flag describe the periodic
// image shift applied to the coordinates only.
// Returns the number of values packed (6*n).
//
// FIX: this function previously launched Cuda_CommCuda_PackComm_Kernel,
// which packs only the 3 coordinate values, so the velocity half of the
// 6*n buffer was never written.  It now launches the matching
// Cuda_CommCuda_PackCommVel_Kernel.  It also previously requested a
// buffer sized for n*3 values via Cuda_CommCuda_UpdateBuffer(sdata, n);
// passing 2*n makes the allocation cover all 6*n values.
int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
  my_times time1, time2;

  // Refresh device constants if atom arrays were reallocated or nlocal changed.
  if(sdata->atom.update_nmax)
    Cuda_CommCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  int size = n * 6 * sizeof(X_CFLOAT);

  // UpdateBuffer allocates 3 X_CFLOAT per "atom"; request 2*n so the
  // buffer holds the full 6*n values needed here.
  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_CommCuda_UpdateBuffer(sdata, 2 * n);

  // Periodic shift added to the packed coordinates; zero when pbc_flag == 0.
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // Triclinic box: include the tilt factors xy, xz, yz.
      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
      dz = pbc[2] * sdata->domain.prd[2];
    }
  }

  // One thread per atom to pack.
  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    cudaMemset(sdata->flag, 0, sizeof(int));
    my_gettime(CLOCK_REALTIME, &time1);
    // With overlapped communication pack straight into the per-swap device
    // send buffer; otherwise use the shared staging buffer.
    void* buf = sdata->overlap_comm ? sdata->comm.buf_send_dev[iswap] : sdata->buffer;
    Cuda_CommCuda_PackCommVel_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n
        , sdata->comm.maxlistlength, iswap, dx, dy, dz, buf);
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_kernel_pack +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel: Kernel execution failed");

    // Only download to the host when comm is not overlapped with computation.
    if(not sdata->overlap_comm)
      cudaMemcpy(buf_send, sdata->buffer, n * 6 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);

    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_forward_download +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;

    // The kernel sets *flag when a sendlist index exceeded nmax.
    int aflag;
    cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);

    if(aflag != 0) printf("aflag PackComm: %i\n", aflag);

    CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel: Kernel execution failed");
  }

  return 6 * n;
}
// On-device forward communication with self (sender == receiver): copy the
// coordinates of the n sendlist atoms of swap "iswap" directly to the ghost
// region starting at index "first", applying the periodic shift given by
// pbc/pbc_flag.  No host buffer is involved.
// Returns the number of values moved (3*n).
//
// FIX: removed a dead "static int count" that was incremented but never read.
int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
  my_times time1, time2;

  // Refresh device constants if atom arrays were reallocated or nlocal changed.
  if(sdata->atom.update_nmax)
    Cuda_CommCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  int size = n * 3 * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_CommCuda_UpdateBuffer(sdata, n);

  // Periodic shift added to the copied coordinates; zero when pbc_flag == 0.
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // Triclinic box: include the tilt factors xy, xz, yz.
      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
      dz = pbc[2] * sdata->domain.prd[2];
    }
  }

  // One thread per atom to copy.
  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_gettime(CLOCK_REALTIME, &time1);
    Cuda_CommCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_kernel_self +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
  }

  return 3 * n;
}
// On-device forward communication with self, coordinates + velocities:
// copy the n sendlist atoms of swap "iswap" directly to the ghost region
// starting at "first" (coordinates get the pbc shift; velocities are copied
// unchanged).  Returns the number of values moved (6*n).
//
// FIX: this function previously launched Cuda_CommCuda_PackComm_Self_Kernel,
// which copies only coordinates, so ghost velocities were never updated.
// It now launches the matching Cuda_CommCuda_PackCommVel_Self_Kernel.
// Also requests a 6*n-value buffer (UpdateBuffer sizes 3 values per "atom",
// hence 2*n) and removes a dead "static int count".
int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
  MYDBG(printf(" # CUDA: CommCuda_PackCommVel_Self\n");)
  my_times time1, time2;

  // Refresh device constants if atom arrays were reallocated or nlocal changed.
  if(sdata->atom.update_nmax)
    Cuda_CommCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  int size = n * 6 * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_CommCuda_UpdateBuffer(sdata, 2 * n);

  // Periodic shift added to the copied coordinates; zero when pbc_flag == 0.
  X_CFLOAT dx = 0.0;
  X_CFLOAT dy = 0.0;
  X_CFLOAT dz = 0.0;

  if(pbc_flag != 0) {
    if(sdata->domain.triclinic == 0) {
      dx = pbc[0] * sdata->domain.prd[0];
      dy = pbc[1] * sdata->domain.prd[1];
      dz = pbc[2] * sdata->domain.prd[2];
    } else {
      // Triclinic box: include the tilt factors xy, xz, yz.
      dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
      dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
      dz = pbc[2] * sdata->domain.prd[2];
    }
  }

  // One thread per atom to copy.
  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_gettime(CLOCK_REALTIME, &time1);
    Cuda_CommCuda_PackCommVel_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_kernel_self +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_CommCuda_PackCommVel_Self: Kernel execution failed");
  }

  return 6 * n;
}
// Unpack n received ghost-atom coordinates into the x array starting at
// index "first".  With overlap_comm and a valid iswap the data is already
// in a per-swap device buffer; otherwise buf_recv is uploaded from the host
// into the shared staging buffer first.
void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
{
my_times time1, time2;
// Refresh device constants if atom arrays were reallocated or nlocal changed.
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
// One thread per atom to unpack.
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
my_gettime(CLOCK_REALTIME, &time1);
// Upload from host only when not using device-resident overlap buffers
// (iswap < 0 forces the host path).
if(not sdata->overlap_comm || iswap < 0)
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_forward_upload +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
Cuda_CommCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first, buf);
cudaThreadSynchronize();
my_gettime(CLOCK_REALTIME, &time1);
sdata->cuda_timings.comm_forward_kernel_unpack +=
time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
}
}
// Unpack n received ghost-atom coordinates AND velocities (6 values per
// atom) into x and v starting at index "first".  With overlap_comm and a
// valid iswap the data is already in a per-swap device buffer; otherwise
// buf_recv is uploaded from the host into the shared staging buffer first.
//
// FIX: this function previously launched Cuda_CommCuda_UnpackComm_Kernel,
// which unpacks only the 3 coordinate values, so ghost velocities were
// never written.  It now launches the matching
// Cuda_CommCuda_UnpackCommVel_Kernel.  It also previously requested a
// buffer sized for n*3 values; passing 2*n to UpdateBuffer covers 6*n.
void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap)
{
  my_times time1, time2;

  // Refresh device constants if atom arrays were reallocated or nlocal changed.
  if(sdata->atom.update_nmax)
    Cuda_CommCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  int size = n * 6 * sizeof(X_CFLOAT);

  if(sdata->buffer_new or (size > sdata->buffersize))
    Cuda_CommCuda_UpdateBuffer(sdata, 2 * n);

  // One thread per atom to unpack.
  int3 layout = getgrid(n);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    my_gettime(CLOCK_REALTIME, &time1);

    // Upload from host only when not using device-resident overlap buffers
    // (iswap < 0 forces the host path).
    if(not sdata->overlap_comm || iswap < 0)
      cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 6 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);

    my_gettime(CLOCK_REALTIME, &time2);
    sdata->cuda_timings.comm_forward_upload +=
      time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;

    void* buf = (sdata->overlap_comm && iswap >= 0) ? sdata->comm.buf_recv_dev[iswap] : sdata->buffer;
    Cuda_CommCuda_UnpackCommVel_Kernel <<< grid, threads, 0>>>(n, first, buf);
    cudaThreadSynchronize();
    my_gettime(CLOCK_REALTIME, &time1);
    sdata->cuda_timings.comm_forward_kernel_unpack +=
      time1.tv_sec - time2.tv_sec + 1.0 * (time1.tv_nsec - time2.tv_nsec) / 1000000000;
    CUT_CHECK_ERROR("Cuda_CommCuda_UnpackCommVel: Kernel execution failed");
  }
}
// Pack the forces of n ghost atoms starting at index "first" into buf_send
// for reverse communication.  The force array is struct-of-arrays with
// stride nmax, so the three components are copied with three separate
// device-to-host memcpys.  Returns the number of values packed (3*n).
int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* buf_send)
{
// Refresh device constants if atom arrays were reallocated or nlocal changed.
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(F_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
F_CFLOAT* buf = (F_CFLOAT*)buf_send;
F_CFLOAT* f_dev = (F_CFLOAT*)sdata->atom.f.dev_data;
// x components of atoms [first, first+n)
f_dev += first;
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
// y components: advance one nmax stride in the SoA force array
buf += n;
f_dev += sdata->atom.nmax;
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
// z components
buf += n;
f_dev += sdata->atom.nmax;
cudaMemcpy(buf, f_dev, n * sizeof(F_CFLOAT), cudaMemcpyDeviceToHost);
return n * 3;
}
// Unpack reverse-communicated forces from buf_recv and ADD them to the
// forces of the sendlist atoms of swap "iswap" (reverse comm accumulates,
// it does not overwrite).
void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void* buf_recv)
{
// Refresh device constants if atom arrays were reallocated or nlocal changed.
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(F_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
// One thread per atom to accumulate.
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
// Upload the received forces into the shared staging buffer.
cudaMemcpy(sdata->buffer, buf_recv, size, cudaMemcpyHostToDevice);
Cuda_CommCuda_UnpackReverse_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed");
}
}
// On-device reverse communication with self: add the forces of the n ghost
// atoms starting at index "first" onto the forces of the corresponding
// sendlist atoms of swap "iswap".  No host buffer is involved.
void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap, int first)
{
// Refresh device constants if atom arrays were reallocated or nlocal changed.
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, n);
// One thread per atom to accumulate.
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
Cuda_CommCuda_UnpackReverse_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_CommCuda_PackReverse_Self: Kernel execution failed");
}
}
// Build the sendlist for swap "iswap" on the device: scan the candidate
// atoms (range depends on bordergroup/ineed, mirroring Comm::borders in
// LAMMPS) and collect indices of atoms inside the swap's slab.
// style == 1 selects single-cutoff slabs; otherwise per-type (multi) slabs.
// The first int of sdata->buffer is used as a global atomic counter and
// holds the number of atoms appended; that count is returned.
int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap)
{
MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");)
my_times time1, time2;
// Refresh device constants if atom arrays were reallocated or nlocal changed.
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
// Only a tiny scratch area (the atomic counter) is needed here.
if(sdata->buffer_new or (80 > sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata, 10);
// Number of candidate atoms to scan; with bordergroup and ineed < 2 the
// grid must cover both the group prefix and the tail beyond nlocal.
int n;
if(!bordergroup || ineed >= 2)
n = nlast - nfirst + 1;
else {
n = atom_nfirst;
if(nlast - sdata->atom.nlocal + 1 > n) n = nlast - sdata->atom.nlocal + 1;
}
int3 layout = getgrid(n, 0, 512, true);
dim3 threads(layout.z, 1, 1);
// NOTE(review): grid.x + 1 launches one extra block row — presumably a
// safety margin for the rounding in getgrid; confirm against getgrid.
dim3 grid(layout.x + 1, layout.y, 1);
// Zero the atomic append counter at the start of the buffer.
cudaMemset((int*)(sdata->buffer), 0, sizeof(int));
my_gettime(CLOCK_REALTIME, &time1);
// Dynamic shared memory: one flag per thread plus one slot for the
// block's base offset.
if(style == 1)
Cuda_CommCuda_BuildSendlist_Single <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_CFLOAT*) sdata->comm.slablo.dev_data, (X_CFLOAT*) sdata->comm.slabhi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
else
Cuda_CommCuda_BuildSendlist_Multi <<< grid, threads, (threads.x + 1)*sizeof(int) >>> (bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap, (X_CFLOAT*) sdata->comm.multilo.dev_data, (X_CFLOAT*) sdata->comm.multihi.dev_data, (int*) sdata->comm.sendlist.dev_data, sdata->comm.maxlistlength);
cudaThreadSynchronize();
my_gettime(CLOCK_REALTIME, &time2);
sdata->cuda_timings.comm_border_kernel_buildlist +=
time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed");
// Read back how many atoms the kernels appended to the sendlist.
int nsend;
cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
return nsend;
}

View File

@ -1,35 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
// Host-side entry points of the CUDA communication module, implemented in
// comm_cuda.cu.  extern "C" linkage lets the C++ LAMMPS classes call the
// nvcc-compiled definitions.
// Forward comm: pack positions (and, for *Vel, velocities) of swap iswap.
extern "C" int Cuda_CommCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbcflag);
extern "C" int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbcflag);
// "_Self" variants copy on-device when sender == receiver.
extern "C" int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbcflag);
extern "C" int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbcflag);
// Unpack received ghost data; iswap = -1 forces the host-buffer path.
extern "C" void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap = -1);
extern "C" void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata, int n, int first, void* buf_recv, int iswap = -1);
// Reverse comm: pack ghost forces / accumulate them onto owned atoms.
extern "C" int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata, int n, int first, void* buf_send);
extern "C" void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata, int n, int iswap, void* buf_recv);
extern "C" void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata, int n, int iswap, int first);
// Build the device-side sendlist for swap iswap; returns the atom count.
extern "C" int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata, int bordergroup, int ineed, int style, int atom_nfirst, int nfirst, int nlast, int dim, int iswap);

View File

@ -1,394 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Pack positions of the n sendlist atoms of swap "iswap" into buffer,
// adding the periodic shift (dx,dy,dz).  Buffer layout is struct-of-arrays:
// n x-values, then n y-values, then n z-values.  _x has stride _nmax per
// component.  One thread per atom.
__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
// sendlist holds maxlistlength indices per swap.
int* list = sendlist + iswap * maxlistlength;
if(i < n) {
int j = list[i];
// Report an out-of-range atom index via the global flag.
// NOTE(review): ">" rather than ">=" — last valid index is _nmax - 1;
// confirm whether this boundary is intentional.
if(j > _nmax) _flag[0] = 1;
((X_CFLOAT*) buffer)[i] = _x[j] + dx;
((X_CFLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
((X_CFLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
}
}
// Pack positions AND velocities of the n sendlist atoms of swap "iswap"
// into buffer.  Positions get the periodic shift (dx,dy,dz); velocities
// are copied unchanged.  Buffer layout: n x, n y, n z, n vx, n vy, n vz.
// One thread per atom.
__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, void* buffer)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
// sendlist holds maxlistlength indices per swap.
int* list = sendlist + iswap * maxlistlength;
if(i < n) {
int j = list[i];
// Report an out-of-range atom index via the global flag.
if(j > _nmax) _flag[0] = 1;
((X_CFLOAT*) buffer)[i] = _x[j] + dx;
((X_CFLOAT*) buffer)[i + 1 * n] = _x[j + _nmax] + dy;
((X_CFLOAT*) buffer)[i + 2 * n] = _x[j + 2 * _nmax] + dz;
((X_CFLOAT*) buffer)[i + 3 * n] = _v[j];
((X_CFLOAT*) buffer)[i + 4 * n] = _v[j + _nmax];
((X_CFLOAT*) buffer)[i + 5 * n] = _v[j + 2 * _nmax];
}
}
// Self-copy forward comm kernel: copy positions of the n sendlist atoms of
// swap "iswap" directly into the ghost slots starting at "first", adding
// the periodic shift (dx,dy,dz).  _x is struct-of-arrays with stride _nmax.
// One thread per atom.
//
// FIX: removed the dead "int j = i;" initialization that was immediately
// overwritten by "j = list[i];".
__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // sendlist holds maxlistlength indices per swap.
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];

    _x[i + first] = _x[j] + dx;
    _x[i + first + _nmax] = _x[j + _nmax] + dy;
    _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
  }
}
// Self-copy forward comm kernel for positions + velocities: copy the n
// sendlist atoms of swap "iswap" into the ghost slots starting at "first".
// Positions get the periodic shift (dx,dy,dz); velocities are copied
// unchanged.  One thread per atom.
//
// FIX: removed the dead "int j = i;" initialization that was immediately
// overwritten by "j = list[i];".
__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, X_CFLOAT dx, X_CFLOAT dy, X_CFLOAT dz, int first)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // sendlist holds maxlistlength indices per swap.
  int* list = sendlist + iswap * maxlistlength;

  if(i < n) {
    int j = list[i];

    _x[i + first] = _x[j] + dx;
    _x[i + first + _nmax] = _x[j + _nmax] + dy;
    _x[i + first + 2 * _nmax] = _x[j + 2 * _nmax] + dz;
    _v[i + first] = _v[j];
    _v[i + first + _nmax] = _v[j + _nmax];
    _v[i + first + 2 * _nmax] = _v[j + 2 * _nmax];
  }
}
// Unpack n received ghost-atom positions from buffer into _x starting at
// index "first".  Buffer layout is struct-of-arrays: n x-values, then
// n y-values, then n z-values; _x has stride _nmax per component.
__global__ void Cuda_CommCuda_UnpackComm_Kernel(int n, int first, void* buffer)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(tid >= n) return;   // guard clause: threads past the data do nothing

  X_CFLOAT* recv = (X_CFLOAT*) buffer;   // single typed view of the buffer
  const int dst = tid + first;

  _x[dst] = recv[tid];
  _x[dst + _nmax] = recv[tid + n];
  _x[dst + 2 * _nmax] = recv[tid + 2 * n];
}
// Unpack n received ghost-atom positions AND velocities from buffer into
// _x and _v starting at index "first".  Buffer layout: n x, n y, n z,
// n vx, n vy, n vz; _x/_v have stride _nmax per component.
__global__ void Cuda_CommCuda_UnpackCommVel_Kernel(int n, int first, void* buffer)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(tid >= n) return;   // guard clause: threads past the data do nothing

  X_CFLOAT* recv = (X_CFLOAT*) buffer;   // single typed view of the buffer
  const int dst = tid + first;

  _x[dst] = recv[tid];
  _x[dst + _nmax] = recv[tid + n];
  _x[dst + 2 * _nmax] = recv[tid + 2 * n];
  _v[dst] = recv[tid + 3 * n];
  _v[dst + _nmax] = recv[tid + 4 * n];
  _v[dst + 2 * _nmax] = recv[tid + 5 * n];
}
// Pack forces of n ghost atoms starting at index "first" into the shared
// staging buffer for reverse communication.  Output layout is
// struct-of-arrays: n fx, then n fy, then n fz.
__global__ void Cuda_CommCuda_PackReverse_Kernel(int n, int first)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(tid >= n) return;   // guard clause: threads past the data do nothing

  F_CFLOAT* out = (F_CFLOAT*) _buffer;   // single typed view of the buffer
  const int src = tid + first;

  out[tid] = _f[src];
  out[tid + n] = _f[src + _nmax];
  out[tid + 2 * n] = _f[src + 2 * _nmax];
}
// Accumulate reverse-communicated forces from the staging buffer onto the
// forces of the n sendlist atoms of swap "iswap".  Reverse comm ADDS to _f,
// it does not overwrite.  Buffer layout: n fx, then n fy, then n fz.
__global__ void Cuda_CommCuda_UnpackReverse_Kernel(int* sendlist, int n, int maxlistlength, int iswap)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // sendlist holds maxlistlength indices per swap.
  int* list = sendlist + iswap * maxlistlength;

  if(tid >= n) return;   // guard clause: threads past the data do nothing

  F_CFLOAT* recv = (F_CFLOAT*) _buffer;   // single typed view of the buffer
  const int dst = list[tid];

  _f[dst] += recv[tid];
  _f[dst + _nmax] += recv[tid + n];
  _f[dst + 2 * _nmax] += recv[tid + 2 * n];
}
// Self-copy reverse comm kernel: add the forces of the n ghost atoms
// starting at index "first" onto the forces of the corresponding sendlist
// atoms of swap "iswap", entirely on-device.
__global__ void Cuda_CommCuda_UnpackReverse_Self_Kernel(int* sendlist, int n, int maxlistlength, int iswap, int first)
{
  const int tid = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // sendlist holds maxlistlength indices per swap.
  int* list = sendlist + iswap * maxlistlength;

  if(tid >= n) return;   // guard clause: threads past the data do nothing

  const int dst = list[tid];
  const int src = tid + first;

  _f[dst] += _f[src];
  _f[dst + _nmax] += _f[src + _nmax];
  _f[dst + 2 * _nmax] += _f[src + 2 * _nmax];
}
extern __shared__ int shared[];
// Build the sendlist for swap "iswap" with a single slab [lo, hi] per swap.
// Each thread tests one candidate atom; per block, thread 0 serially turns
// the shared-memory flags into local ranks, reserves a contiguous range in
// the global list via atomicAdd on the counter at _buffer[0], and then each
// flagged thread writes its atom index at base + rank - 1.
// With bordergroup and ineed < 2 the scan runs twice: once over the group
// prefix [0, atom_nfirst) and once over the tail starting at _nlocal.
// Shared memory: blockDim.x flag/rank slots + 1 slot for the block's base.
__global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup, int ineed, int atom_nfirst,
int nfirst, int nlast, int dim, int iswap, X_CFLOAT* slablo, X_CFLOAT* slabhi, int* sendlist, int maxlistlength)
{
int* list = sendlist + iswap * maxlistlength;
X_CFLOAT lo = slablo[iswap];
X_CFLOAT hi = slabhi[iswap];
bool add = false;
if(!bordergroup || ineed >= 2) {
// Single pass over [nfirst, nlast).
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x + nfirst;
// Flag atoms whose coordinate along "dim" lies inside the slab.
if(i < nlast)
if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) {
add = true;
}
shared[threadIdx.x] = add ? 1 : 0;
__syncthreads();
int nsend = 0;
// Thread 0: serial prefix count over the block's flags; shared[k]
// becomes the 1-based rank of flagged thread k.
if(threadIdx.x == 0) {
for(int k = 0; k < blockDim.x; k++) {
if(shared[k]) {
nsend++;
shared[k] = nsend;
}
}
// Reserve nsend slots in the global list; store the base offset.
shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
}
__syncthreads();
// Global slot = block base + local rank - 1.
nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
if(add && nsend < maxlistlength)
list[nsend] = i;
} else {
// Pass 1: the bordergroup prefix [0, atom_nfirst).
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < atom_nfirst)
if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) {
add = true;
}
shared[threadIdx.x] = add ? 1 : 0;
__syncthreads();
int nsend = 0;
if(threadIdx.x == 0) {
for(int k = 0; k < blockDim.x; k++) {
if(shared[k]) {
nsend++;
shared[k] = nsend;
}
}
shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
}
__syncthreads();
nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
if(add && nsend < maxlistlength)
list[nsend] = i;
// Barrier before the shared flags are reused by pass 2.
__syncthreads();
// Pass 2: the tail starting at _nlocal, up to nlast.
add = false;
i += _nlocal;
if(i < nlast)
if(_x[i + dim * _nmax] >= lo && _x[i + dim * _nmax] <= hi) {
add = true;
}
shared[threadIdx.x] = add ? 1 : 0;
__syncthreads();
nsend = 0;
if(threadIdx.x == 0) {
for(int k = 0; k < blockDim.x; k++) {
if(shared[k]) {
nsend++;
shared[k] = nsend;
}
}
shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
}
__syncthreads();
nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
if(add && nsend < maxlistlength)
list[nsend] = i;
}
}
// Build the sendlist for swap "iswap" with per-atom-type slab bounds
// (comm_modify multi): atom i is selected when its coordinate along "dim"
// lies inside [mlo[type], mhi[type]].  Same block-level collection scheme
// as Cuda_CommCuda_BuildSendlist_Single: per-block serial prefix count in
// shared memory, atomicAdd on _buffer[0] to reserve global slots.
__global__ void Cuda_CommCuda_BuildSendlist_Multi(int bordergroup, int ineed, int atom_nfirst
, int nfirst, int nlast, int dim, int iswap, X_CFLOAT* multilo, X_CFLOAT* multihi, int* sendlist, int maxlistlength)
{
int* list = sendlist + iswap * maxlistlength;
// Per-type bounds for this swap (cuda_ntypes entries per swap).
X_CFLOAT* mlo = &multilo[iswap * _cuda_ntypes];
X_CFLOAT* mhi = &multihi[iswap * _cuda_ntypes];
int itype = 0;
bool add = false;
if(!bordergroup || ineed >= 2) {
// Single pass over [nfirst, nlast).
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x + nfirst;
if(i < nlast) {
itype = _type[i];
if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) {
add = true;
}
}
shared[threadIdx.x] = add ? 1 : 0;
__syncthreads();
int nsend = 0;
// Thread 0: serial prefix count; shared[k] becomes the 1-based rank of
// flagged thread k, shared[blockDim.x] the block's global base offset.
if(threadIdx.x == 0) {
for(int k = 0; k < blockDim.x; k++) {
if(shared[k]) {
nsend++;
shared[k] = nsend;
}
}
shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
}
__syncthreads();
// Global slot = block base + local rank - 1.
nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
if(add && nsend < maxlistlength)
list[nsend] = i;
} else {
// Pass 1: the bordergroup prefix [0, atom_nfirst).
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < atom_nfirst) {
itype = _type[i];
if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) {
add = true;
}
}
shared[threadIdx.x] = add ? 1 : 0;
__syncthreads();
int nsend = 0;
if(threadIdx.x == 0) {
for(int k = 0; k < blockDim.x; k++) {
if(shared[k]) {
nsend++;
shared[k] = nsend;
}
}
shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
}
__syncthreads();
nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
if(add && nsend < maxlistlength)
list[nsend] = i;
// Barrier before the shared flags are reused by pass 2.
__syncthreads();
// Pass 2: the tail starting at _nlocal, up to nlast.
add = false;
i += _nlocal;
if(i < nlast) {
itype = _type[i];
if(_x[i + dim * _nmax] >= mlo[itype] && _x[i + dim * _nmax] <= mhi[itype]) {
add = true;
}
}
shared[threadIdx.x] = add ? 1 : 0;
__syncthreads();
nsend = 0;
if(threadIdx.x == 0) {
for(int k = 0; k < blockDim.x; k++) {
if(shared[k]) {
nsend++;
shared[k] = nsend;
}
}
shared[blockDim.x] = atomicAdd((int*) _buffer, nsend);
}
__syncthreads();
nsend = shared[blockDim.x] + shared[threadIdx.x] - 1;
if(add && nsend < maxlistlength)
list[nsend] = i;
}
}

View File

@ -1,126 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX compute_temp_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "compute_temp_cuda_cu.h"
#include "compute_temp_cuda_kernel.cu"
// Grow the shared device buffer so it can hold the per-block partial sums
// of the temperature reduction (6 ENERGY_CFLOAT per 64-thread block), then
// publish the (possibly new) pointer to MY_AP(buffer).  Only grows.
void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata)
{
// ceil(nlocal / 64) blocks * 6 tensor components per block.
int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_CFLOAT);
if(sdata->buffersize < size) {
// NOTE(review): debug message says "kB" but buffersize is in bytes.
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
// Signal other modules that the buffer pointer changed.
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
// Refresh this module's __constant__ copies of the per-atom array pointers
// and sizes used by the temperature kernels (mask, masses, velocities,
// types).  Must be called after device-side atom arrays are reallocated.
void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));
// Per-atom masses only exist when the atom style provides rmass.
if(sdata->atom.rmass_flag)
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
}
// One-time initialization: just seed the module's device constants.
void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata)
{
Cuda_ComputeTempCuda_UpdateNmax(sdata);
}
// Compute the 6-component kinetic energy tensor of all atoms in the group
// (groupbit mask) in two stages: a per-block reduction into the shared
// buffer, then a 6-block final reduction writing the result to t.
void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
//if(sdata->buffer_new)
Cuda_ComputeTempCuda_UpdateBuffer(sdata);
// One thread per local atom.
int3 layout = getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
// Stage 1: per-block partial sums of the 6 tensor components
// (6 ENERGY_CFLOAT of shared memory per thread).
Cuda_ComputeTempCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_CFLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed");
// Stage 2: one block per tensor component reduces the oldgrid partials.
int oldgrid = grid.x * grid.y;
grid.x = 6;
grid.y = 1;
threads.x = 512;
Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed");
}
}
// Compute the scalar sum of m*v^2 over all atoms in the group (groupbit
// mask) in two stages: a per-block reduction into the shared buffer, then
// a single-block final reduction writing the result to t.
void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
//if(sdata->buffer_new)
Cuda_ComputeTempCuda_UpdateBuffer(sdata);
MYDBG(printf("#CUDA ComputeTempCuda_Scalar: %i\n", sdata->atom.nlocal);)
// One thread per local atom.
int3 layout = getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel");
// Stage 1: per-block partial sums (one ENERGY_CFLOAT of shared memory
// per thread).
Cuda_ComputeTempCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed");
// Stage 2: a single 512-thread block reduces the oldgrid partials.
int oldgrid = grid.x * grid.y;
grid.x = 1;
grid.y = 1;
threads.x = 512;
Cuda_ComputeTempCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed");
}
}

View File

@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t);
extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t);

View File

@ -1,118 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_CFLOAT sharedmem[];
// One thread per local atom: accumulate m*|v|^2 for atoms selected by
// groupbit, reduce within the block via shared memory, and write one partial
// sum per block to the global buffer. Velocity components are stored with
// stride _nmax (x at i, y at i+_nmax, z at i+2*_nmax — same layout the
// RemoveBiasAll kernels use).
__global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit)
{
  // Flatten the 2D grid into a linear atom index.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  sharedmem[threadIdx.x] = 0;  // inactive / non-group threads contribute 0

  if(i < _nlocal) {
    if(_rmass_flag) {
      // Per-atom masses.
      if(_mask[i] & groupbit)
        sharedmem[threadIdx.x] = (_v[i] * _v[i] + _v[i + _nmax] * _v[i + _nmax] + _v[i + 2 * _nmax] * _v[i + 2 * _nmax]) * _rmass[i];
    } else {
      // Per-type masses.
      if(_mask[i] & groupbit)
        sharedmem[threadIdx.x] = (_v[i] * _v[i] + _v[i + _nmax] * _v[i + _nmax] + _v[i + 2 * _nmax] * _v[i + 2 * _nmax]) * (_mass[_type[i]]);
    }
  }

  reduceBlock(sharedmem);  // block-wide sum into sharedmem[0]
  ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // One partial sum per block; folded later by the Reduce kernel.
    buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0];
  }
}
// One thread per local atom: accumulate the six components of the kinetic
// energy tensor (m*vx*vx, m*vy*vy, m*vz*vz, m*vx*vy, m*vx*vz, m*vy*vz) for
// atoms selected by groupbit. Requires 6*blockDim.x ENERGY_CFLOATs of
// dynamic shared memory; writes 6 partial sums per block to _buffer, each
// component in its own gridDim.x*gridDim.y-sized slice.
__global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
{
  // Flatten the 2D grid into a linear atom index.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  // Zero all six accumulators so inactive threads contribute nothing.
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 4 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 5 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      V_CFLOAT massone;

      if(_rmass_flag) massone = _rmass[i];     // per-atom mass
      else massone = _mass[_type[i]];          // per-type mass

      // Velocity components have stride _nmax: x at i, y at i+_nmax, z at i+2*_nmax.
      sharedmem[threadIdx.x] = massone * _v[i] * _v[i];
      sharedmem[threadIdx.x + blockDim.x] = massone * _v[i + _nmax] * _v[i + _nmax];
      sharedmem[threadIdx.x + 2 * blockDim.x] = massone * _v[i + 2 * _nmax] * _v[i + 2 * _nmax];
      sharedmem[threadIdx.x + 3 * blockDim.x] = massone * _v[i] * _v[i + _nmax];
      sharedmem[threadIdx.x + 4 * blockDim.x] = massone * _v[i] * _v[i + 2 * _nmax];
      sharedmem[threadIdx.x + 5 * blockDim.x] = massone * _v[i + _nmax] * _v[i + 2 * _nmax];
    }

  // Reduce each of the six component slices separately
  // (reduceBlock starts with __syncthreads, so no extra barrier is needed).
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  reduceBlock(&sharedmem[3 * blockDim.x]);
  reduceBlock(&sharedmem[4 * blockDim.x]);
  reduceBlock(&sharedmem[5 * blockDim.x]);
  ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // Component c of block b goes to buffer[b + c*numblocks].
    buffer[(blockIdx.x * gridDim.y + blockIdx.y)] = sharedmem[0];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 4 * gridDim.x * gridDim.y] = sharedmem[4 * blockDim.x];
    buffer[(blockIdx.x * gridDim.y + blockIdx.y) + 5 * gridDim.x * gridDim.y] = sharedmem[5 * blockDim.x];
  }
}
// Fold n per-block partial sums from _buffer into t[blockIdx.x]. Each block
// reduces one contiguous slice of n values (the host launches one block per
// output component: 1 for the scalar path, 6 for the tensor path).
__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n, ENERGY_CFLOAT* t)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  ENERGY_CFLOAT myforig = 0.0;  // running total, only meaningful on thread 0
  ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
  buf = &buf[blockIdx.x * n];   // this block's slice of partial sums

  // Consume the n inputs in chunks of blockDim.x.
  while(i < n) {
    sharedmem[threadIdx.x] = 0;  // pad the last, possibly short, chunk with 0

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);      // chunk sum ends up in sharedmem[0]
    i += blockDim.x;

    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    t[blockIdx.x] = myforig;
}

View File

@ -1,164 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX compute_temp_partial_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "compute_temp_partial_cuda_cu.h"
#include "compute_temp_partial_cuda_kernel.cu"
// Ensure the shared device-side reduction buffer is large enough for this
// compute's partial sums, growing (never shrinking) it if necessary, and
// publish its address to the device symbol MY_AP(buffer).
void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  // Six ENERGY_CFLOAT accumulators (vx^2, vy^2, vz^2, vx*vy, vx*vz, vy*vz)
  // per group of 64 atoms.
  // NOTE(review): getgrid() can choose 32-thread blocks for small systems,
  // which produces more per-block partials than this ceil(nlocal/64)-based
  // formula accounts for — confirm the buffer cannot be overrun in that case.
  int size = (unsigned)((sdata->atom.nlocal + 63) / 64.0) * 6 * sizeof(ENERGY_CFLOAT);

  if(sdata->buffersize < size) {
    // BUGFIX: buffersize is a byte count; the old messages labeled it "kB".
    MYDBG(printf("Cuda_ComputeTempPartialCuda Resizing Buffer at %p with %i bytes to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;  // signal other computes that the buffer moved
    MYDBG(printf("New buffer at %p with %i bytes\n", sdata->buffer, sdata->buffersize);)
  }

  // Only the pointer value is copied (sizeof(int*) == pointer size).
  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
// Copy the current host-side device pointers and sizes into the kernels'
// constant symbols (MY_AP(...)). Must be re-run whenever the atom arrays may
// have been reallocated (nmax changed), since the cached device pointers
// would otherwise dangle.
void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask)       , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(mass)       , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));

  // rmass only exists when per-atom masses are in use.
  if(sdata->atom.rmass_flag)
    cudaMemcpyToSymbol(MY_AP(rmass)    , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));

  cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nlocal)     , & sdata->atom.nlocal , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)       , & sdata->atom.nmax   , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(v)          , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(type)       , & sdata->atom.type .dev_data, sizeof(int*));
}
// One-time setup: seed the device constant symbols with the current
// array pointers and sizes.
void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata)
{
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
}
// Two-pass computation of the six kinetic-energy tensor components, with the
// x/y/z velocity contributions gated by xflag/yflag/zflag. Results land in
// t[0..5].
void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag)
{
  // This compute is most likely not called every timestep, so the cached
  // device constants may be stale: refresh them unconditionally.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    // Pass 1: per-block partials; 6 shared-mem accumulators per thread.
    Cuda_ComputeTempPartialCuda_Vector_Kernel <<< grid, threads, threads.x* 6* sizeof(ENERGY_CFLOAT)>>> (groupbit, xflag, yflag, zflag);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed");

    // Pass 2: one block per tensor component folds its partials into t[].
    int oldgrid = grid.x * grid.y;
    grid.x = 6;
    grid.y = 1;  // BUGFIX: grid.y was left at layout.y, launching layout.y
                 // identical copies of each reduction block (redundant
                 // duplicate writes to t[]); cf. Cuda_ComputeTempCuda_Scalar,
                 // which resets both grid dimensions.
    threads.x = 512;
    Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed");
  }
}
// Two-pass computation of the scalar kinetic energy sum(m*v^2), with the
// x/y/z velocity contributions gated by xflag/yflag/zflag. Result lands
// in t[0].
void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag)
{
  // This compute is most likely not called every timestep, so the cached
  // device constants may be stale: refresh them unconditionally.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
  MYDBG(printf("#CUDA ComputeTempPartialCuda_Scalar: %i\n", sdata->atom.nlocal);)

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel");

    // Pass 1: per-block partial sums of m*v^2 over the selected components.
    Cuda_ComputeTempPartialCuda_Scalar_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (groupbit, xflag, yflag, zflag);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed");

    // Pass 2: a single block folds the partials into t[0].
    int oldgrid = grid.x * grid.y;
    grid.x = 1;
    grid.y = 1;  // BUGFIX: grid.y was left at layout.y, launching layout.y
                 // identical copies of the reduction block (redundant
                 // duplicate writes to t[0]); cf. Cuda_ComputeTempCuda_Scalar,
                 // which resets both grid dimensions.
    threads.x = 512;
    Cuda_ComputeTempPartialCuda_Reduce_Kernel <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>> (oldgrid, t);
    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed");
  }
}
// Host wrapper: store the velocity components excluded by xflag/yflag/zflag
// into vbiasall and zero them in the device velocity array, so a following
// temperature computation ignores them.
void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall)
{
  // This compute is most likely not called every timestep, so the cached
  // device constants may be stale: refresh them unconditionally.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), & sdata->atom.nlocal, sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  if(sdata->atom.nlocal <= 0) return;  // no local atoms: nothing to launch

  const int3 layout = getgrid(sdata->atom.nlocal);
  dim3 block_threads(layout.z, 1, 1);
  dim3 launch_grid(layout.x, layout.y, 1);

  Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel <<< launch_grid, block_threads, 0>>> (groupbit, xflag, yflag, zflag, (V_CFLOAT*) vbiasall);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
}
// Add the velocity bias previously saved by RemoveBiasAll back onto the
// device velocity array for the components excluded by xflag/yflag/zflag.
void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall)
{
  // This compute is most likely not called every timestep, so the cached
  // device constants may be stale: refresh them unconditionally.
  Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
  Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  if(sdata->atom.nlocal > 0) {
    Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, (V_CFLOAT*) vbiasall);
    cudaThreadSynchronize();
    // BUGFIX: error label previously said "RemoveBiasAll" (copy-paste error),
    // which misattributed failures of this kernel in diagnostics.
    CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RestoreBiasAll: restore_bias Kernel execution failed");
  }
}

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag);
extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit, ENERGY_CFLOAT* t, int xflag, int yflag, int zflag);
extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall);
extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, void* vbiasall);

View File

@ -1,161 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_CFLOAT sharedmem[];
// One thread per local atom: accumulate m*v^2 for atoms in the group, with
// each velocity component multiplied by its 0/1 flag so excluded components
// drop out. Block-reduces into shared memory and writes one partial sum per
// block to _buffer. Velocity components have stride _nmax.
__global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit, int xflag, int yflag, int zflag)
{
  // Flatten the 2D grid into a linear atom index.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  sharedmem[threadIdx.x] = 0;  // inactive / non-group threads contribute 0

  if(i < _nlocal) {
    if(_rmass_flag) {
      // Per-atom masses.
      if(_mask[i] & groupbit)
        sharedmem[threadIdx.x] = (_v[i] * _v[i] * xflag + _v[i + _nmax] * _v[i + _nmax] * yflag + _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag) * _rmass[i];
    } else {
      // Per-type masses.
      if(_mask[i] & groupbit)
        sharedmem[threadIdx.x] = (_v[i] * _v[i] * xflag + _v[i + _nmax] * _v[i + _nmax] * yflag + _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag) * (_mass[_type[i]]);
    }
  }

  reduceBlock(sharedmem);  // block-wide sum into sharedmem[0]
  ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // One partial sum per block; folded later by the Reduce kernel.
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
  }
}
// One thread per local atom: accumulate the six kinetic-energy tensor
// components with each component gated by the corresponding 0/1 flags
// (cross terms use the product of both flags). Requires 6*blockDim.x
// ENERGY_CFLOATs of dynamic shared memory; writes 6 partial sums per block
// to _buffer, each component in its own gridDim.x*gridDim.y-sized slice.
__global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit, int xflag, int yflag, int zflag)
{
  // Flatten the 2D grid into a linear atom index.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  // Zero all six accumulators so inactive threads contribute nothing.
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 4 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 5 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      V_CFLOAT massone;

      if(_rmass_flag) massone = _rmass[i];     // per-atom mass
      else massone = _mass[_type[i]];          // per-type mass

      // Velocity components have stride _nmax: x at i, y at i+_nmax, z at i+2*_nmax.
      sharedmem[threadIdx.x] = massone * _v[i] * _v[i] * xflag;
      sharedmem[threadIdx.x + blockDim.x] = massone * _v[i + _nmax] * _v[i + _nmax] * yflag;
      sharedmem[threadIdx.x + 2 * blockDim.x] = massone * _v[i + 2 * _nmax] * _v[i + 2 * _nmax] * zflag;
      sharedmem[threadIdx.x + 3 * blockDim.x] = massone * _v[i] * _v[i + _nmax] * xflag * yflag;
      sharedmem[threadIdx.x + 4 * blockDim.x] = massone * _v[i] * _v[i + 2 * _nmax] * xflag * zflag;
      sharedmem[threadIdx.x + 5 * blockDim.x] = massone * _v[i + _nmax] * _v[i + 2 * _nmax] * yflag * zflag;
    }

  // Reduce each of the six component slices separately
  // (reduceBlock starts with __syncthreads, so no extra barrier is needed).
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  reduceBlock(&sharedmem[3 * blockDim.x]);
  reduceBlock(&sharedmem[4 * blockDim.x]);
  reduceBlock(&sharedmem[5 * blockDim.x]);
  ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // Component c of block b goes to buffer[b + c*numblocks].
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = sharedmem[4 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = sharedmem[5 * blockDim.x];
  }
}
// Fold n per-block partial sums from _buffer into t[blockIdx.x]. Each block
// reduces one contiguous slice of n values (one launched block per output
// component: 1 for the scalar path, 6 for the tensor path).
__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n, ENERGY_CFLOAT* t)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  ENERGY_CFLOAT myforig = 0.0;  // running total, only meaningful on thread 0
  ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
  buf = &buf[blockIdx.x * n];   // this block's slice of partial sums

  // Consume the n inputs in chunks of blockDim.x.
  while(i < n) {
    sharedmem[threadIdx.x] = 0;  // pad the last, possibly short, chunk with 0

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);      // chunk sum ends up in sharedmem[0]
    i += blockDim.x;

    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    t[blockIdx.x] = myforig;
}
// For each group atom, save the velocity components that are EXCLUDED from
// the temperature (flag == 0) into vbiasall and zero them in _v, so later
// KE computations ignore them. Components have stride _nmax
// (x at i, y at i+_nmax, z at i+2*_nmax); vbiasall uses the same layout.
__global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_CFLOAT* vbiasall)
{
  // Flatten the 2D grid into a linear atom index.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      if(!xflag) {
        vbiasall[i] = _v[i];
        _v[i] = V_F(0.0);
      }

      if(!yflag) {
        vbiasall[i + _nmax] = _v[i + _nmax];
        _v[i + _nmax] = V_F(0.0);
      }

      if(!zflag) {
        vbiasall[i + 2 * _nmax] = _v[i + 2 * _nmax];
        _v[i + 2 * _nmax] = V_F(0.0);
      }
    }
}
// Inverse of RemoveBiasAll: add the saved bias from vbiasall back onto _v
// for the components that were excluded (flag == 0). Uses += (not =) so any
// velocity change made while the bias was removed is preserved.
__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit, int xflag, int yflag, int zflag, V_CFLOAT* vbiasall)
{
  // Flatten the 2D grid into a linear atom index.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      if(!xflag) {
        _v[i] += vbiasall[i];
      }

      if(!yflag) {
        _v[i + _nmax] += vbiasall[i + _nmax];
      }

      if(!zflag) {
        _v[i + 2 * _nmax] += vbiasall[i + 2 * _nmax];
      }
    }
}

View File

@ -1,919 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CRM_CUDA_UTILS
#define CRM_CUDA_UTILS
//split n threads into 2 dimensional grid + threads, return values are grid.x grid.y and threads.x
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
// Split n work items into a 2D grid of 1D thread blocks; returns grid.x,
// grid.y and the block size in the x/y/z fields of an int3.
//  - shared_per_thread: bytes of shared memory needed per thread; caps the
//    block size so one block fits in ~16 kB of shared memory.
//  - threadsmax: upper bound on the block size.
//  - p2: force the block size to the largest power of two <= threadsmax.
inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false)
{
  int3 res;
  const int sharedsize = 16000;  // usable shared memory per block, in bytes

  if(shared_per_thread > 0)
    threadsmax = MIN(sharedsize / shared_per_thread, threadsmax);

  // Choose a block size that keeps the block count reasonably high
  // (roughly >= 60 blocks) while respecting threadsmax.
  if((n < 60 * 32) || (threadsmax < 64))        res.z = 32;
  else if((n < 60 * 64) || (threadsmax < 128))  res.z = 64;
  else if((n < 60 * 128) || (threadsmax < 256)) res.z = 128;
  else if((n < 60 * 256) || (threadsmax < 512)) res.z = 256;
  else                                          res.z = 512;

  if(p2) {
    // Power-of-two override: largest 2^k (starting from 16) <= threadsmax.
    res.z = 16;

    while(res.z * 2 <= threadsmax) res.z *= 2;
  }

  const int blocks = (n + res.z - 1) / res.z;

  if(blocks > 10000) {
    // Large launches: use a roughly square 2D grid to stay within
    // per-dimension grid limits.
    res.x = res.y = int(sqrt(blocks));
  } else {
    res.x = blocks;
    res.y = 1;
  }

  // sqrt truncation can undershoot; widen x until all n items are covered.
  while(res.x * res.y * res.z < n) res.x++;

  if(res.x == 0) res.x = 1;  // guard the n == 0 case

  return res;
}
//return value: 1 if f<0 (sign bit set, including -0.0f); else: 0
//take care if working with values as "blockId.x-n" for f: it might be interpreted as a unsigned int
static inline __device__ int negativCUDA(float f)
{
  // Reinterpret the float's bits and extract the IEEE-754 sign bit.
  return ((unsigned int) __float_as_int(f)) >> 31;
}
//return value: -1 if f<0; else +1 (note: +1 for f == 0.0f and for NaN)
static inline __device__ float fsignCUDA(float f)
{
  if(f < 0.0f)
    return -1.0f;

  return 1.0f;
}
//functions to copy data between global and shared memory (indeed you can copy data between two arbitrary memory regims on device - as long as you have read respectively write rights)
//blockDim.y and blockDim.z are assumed to be 1
// All threads of a block cooperate: thread t copies elements t, t+blockDim.x,
// t+2*blockDim.x, ...; the trailing if handles the final partial chunk.
// Each overload ends with __syncthreads() so the copy is visible block-wide.

// int overload.
static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;  // full chunks end here

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {  // remaining n - i (< blockDim.x) elements
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

// float overload (same algorithm).
static inline __device__ void copySharedToGlob(float* shared, float* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}

// double overload (same algorithm).
static inline __device__ void copySharedToGlob(double* shared, double* glob, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    glob[i + threadIdx.x] = shared[i + threadIdx.x];
  }

  __syncthreads();
}
// Block-cooperative copy from global to shared memory; mirror image of
// copySharedToGlob (same chunked strategy, blockDim.y == blockDim.z == 1,
// ends with a block-wide barrier).

// int overload.
static inline __device__ void copyGlobToShared(int* glob, int* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;  // full chunks end here

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {  // remaining partial chunk
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

// float overload (same algorithm).
static __device__ inline void copyGlobToShared(float* glob, float* shared, const int &n)
{
  int i, k;
  k = n - blockDim.x;

  for(i = 0; i < k; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}

// double overload (same algorithm, loop bound written inline).
static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n)
{
  int i;

  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  if(threadIdx.x < n - i) {
    shared[i + threadIdx.x] = glob[i + threadIdx.x];
  }

  __syncthreads();
}
//copy data between two memory areas on device, 3d BlockDims are allowed
// Unlike the copy*To* helpers above, these flatten a (possibly 3D) thread
// block into a linear offset and stride by the full block volume.
// Each overload ends with __syncthreads().

// double overload.
static __device__ inline void copyData(double* source, double* target, const int &n)
{
  int i;
  // Linearized thread index within the (x, y, z) block.
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {  // final partial chunk
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

// float overload (same algorithm).
static __device__ inline void copyData(float* source, float* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

// int overload (same algorithm).
static __device__ inline void copyData(int* source, int* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}

// unsigned int overload (same algorithm).
static __device__ inline void copyData(unsigned int* source, unsigned int* target, const int &n)
{
  int i;
  int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;

  for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
    target[i + offset] = source[i + offset];
  }

  if(offset < n - i) {
    target[i + offset] = source[i + offset];
  }

  __syncthreads();
}
//functions in order to sum over values of one block. P2 means blockdim MUST be a power of 2 otherwise the behaviour is not well defined
//in the end in data[0]=sum_i=0^blockDim.x data[i]
//for reduceBlockP2 and reduceBlock blockDim.y=1 and blockDim.z=1
// Classic tree reduction: each round halves the number of active threads and
// folds the upper half of the array onto the lower half. The leading
// __syncthreads() makes prior writes to data[] visible.

// int overload.
static __device__ inline void reduceBlockP2(int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

// unsigned int overload (same algorithm).
static __device__ inline void reduceBlockP2(unsigned int* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

// float overload (same algorithm).
static __device__ inline void reduceBlockP2(float* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}

// double overload (same algorithm).
static __device__ inline void reduceBlockP2(double* data)
{
  __syncthreads();

  for(int i = 2; i <= blockDim.x; i *= 2) {
    if(threadIdx.x < blockDim.x / i)
      data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];

    __syncthreads();
  }
}
// Sum all blockDim.x entries of data[] into data[0], for ARBITRARY (not
// necessarily power-of-two) blockDim.x: first fold the tail above the largest
// power of two p2 <= blockDim.x onto the front, then tree-reduce the first p2
// entries. blockDim.y and blockDim.z must be 1.

// float overload.
static __device__ inline void reduceBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  // Largest power of two strictly covering the reduction
  // (p2 == blockDim.x is kept when blockDim.x is itself a power of two).
  while(p2 * 2 < blockDim.x) p2 *= 2;

  // Fold the blockDim.x - p2 tail elements onto the front.
  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  // Standard tree reduction over the first p2 entries.
  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

// int overload (same algorithm).
static __device__ inline void reduceBlock(int* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

// unsigned int overload (same algorithm).
static __device__ inline void reduceBlock(unsigned int* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

// double overload (same algorithm).
static __device__ inline void reduceBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}
// Block-cooperative fill of data[0..n-1] with a constant; same chunked
// access pattern as the copy helpers, but WITHOUT a trailing __syncthreads()
// — callers must synchronize before reading the filled data.

// int variant.
static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value)
{
  int i;

  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;  // partial tail chunk
}

// float variant (same algorithm).
static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value)
{
  int i;

  for(i = 0; i < n - blockDim.x; i += blockDim.x) {
    data[i + threadIdx.x] = value;
  }

  if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}
// Reduce n values (n may exceed blockDim.x) of data[] toward data[0].
// NOTE(review): 'j' is NOT reset to 0 before or inside the for loop, so the
// later folding rounds start from wherever the first pass left off; the
// "* 2" in the first loop's bound also disagrees with the analogous
// min/maxOfData helpers below. The original author flagged these functions
// as "not sure if working" — verify before relying on them.

// float overload.
static __device__ inline void reduce(float* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  // Largest power of two with p2*2 < n.
  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  // Fold the tail above p2 onto the front, multiple elements per thread.
  while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  // Tree reduction over the first p2 entries (see NOTE(review) about j).
  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}

// double overload (same algorithm, same caveats).
static __device__ inline void reduce(double* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
    data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
      j++;
    }

    __syncthreads();
  }
}
// Block-wide min/max over blockDim.x entries, result in data[0]. Same
// fold-tail-then-tree structure as reduceBlock, with MIN/MAX instead of +.
// blockDim.y and blockDim.z must be 1.

// float min.
static __device__ inline void minOfBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;  // largest power of two with p2*2 < blockDim.x

  // Fold the tail above p2 onto the front.
  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  // Tree reduction over the first p2 entries.
  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

// float max (same algorithm).
static __device__ inline void maxOfBlock(float* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

// double min (same algorithm).
static __device__ inline void minOfBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}

// double max (same algorithm).
static __device__ inline void maxOfBlock(double* data)
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);

    __syncthreads();
  }
}
// Min/max over n values (n may exceed blockDim.x), result intended in
// data[0].
// NOTE(review): as in reduce() above, 'j' carries over from the first folding
// pass into the tree-reduction rounds instead of restarting at 0 each round;
// the original author flagged these as "not sure if working" — verify before
// relying on them.

// double min.
static __device__ inline void minOfData(double* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;  // largest power of two with p2*2 < n

  int j = 0;

  // Fold the tail above p2 onto the front, multiple elements per thread.
  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  // Tree reduction rounds (see NOTE(review) about j).
  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

// double max (same algorithm, same caveats).
static __device__ inline void maxOfData(double* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

// float min (same algorithm, same caveats).
static __device__ inline void minOfData(float* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}

// float max (same algorithm, same caveats).
static __device__ inline void maxOfData(float* data, int n) //cautious not sure if working
{
  __syncthreads();
  int p2 = 1;

  while(p2 * 2 < n) p2 *= 2;

  int j = 0;

  while((threadIdx.x + blockDim.x * j) < n - p2) {
    data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
    j++;
  }

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    while((threadIdx.x + blockDim.x * j) < p2 / i) {
      data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
      j++;
    }

    __syncthreads();
  }
}
#if X_PRECISION == 2
// Double-precision texture fetches: CUDA textures cannot store doubles, so
// positions are bound as int2/int4 texels and reassembled here with
// __hiloint2double.

// Fetch one double from an int2 texture (one texel per double).
static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
{
  int2 v = tex1Dfetch(t, i);
  return __hiloint2double(v.y, v.x);
}

// Fetch one X_CFLOAT4 (4 doubles) from an int4 texture: element i occupies
// two consecutive int4 texels.
static __device__ inline X_CFLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
{
  int4 v = tex1Dfetch(t, 2 * i);
  int4 u = tex1Dfetch(t, 2 * i + 1);
  X_CFLOAT4 w;

  w.x = __hiloint2double(v.y, v.x);
  w.y = __hiloint2double(v.w, v.z);
  w.z = __hiloint2double(u.y, u.x);
  w.w = __hiloint2double(u.w, u.z);
  return w;
}
#endif
// Bind the packed x_type device array to its texture reference so device code
// can read it through fetchXType(). No-op unless CUDA_USE_TEXTURE is defined.
inline void BindXTypeTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  // Point-sampled, unnormalized access; coordinates wrap.
  _x_type_tex.normalized = false;
  _x_type_tex.filterMode = cudaFilterModePoint;
  _x_type_tex.addressMode[0] = cudaAddressModeWrap;
  const textureReference* texref = &MY_AP(x_type_tex);
#if X_PRECISION == 1
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
  cudaBindTexture(0, texref, sdata->atom.x_type.dev_data, &desc, sdata->atom.nmax * sizeof(X_CFLOAT4));
#else
  // Doubles are stored as int texels; each X_CFLOAT4 occupies two int4 texels.
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, texref, sdata->atom.x_type.dev_data, &desc, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
// Fetch atom i's packed x_type record (position plus, presumably, the atom
// type encoded in the fourth component -- confirm against the packing kernel).
// Reads through the texture when CUDA_USE_TEXTURE is defined; at
// X_PRECISION == 2 the doubles are reassembled from int texels.
static __device__ inline X_CFLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
return tex1Dfetch(_x_type_tex, i);
#else
return tex1Dfetch_double(_x_type_tex, i);
#endif
#else
// texture path disabled: plain global-memory read
return _x_type[i];
#endif
}
#if V_PRECISION == 2
// Double fetch helpers for V-precision textures: each double is stored as an
// int2 texel and reassembled with __hiloint2double.
static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i)
{
  int2 bits = tex1Dfetch(t, i);
  return __hiloint2double(bits.y, bits.x);
}
// One V_CFLOAT4 (four doubles) spans two consecutive int4 texels.
static __device__ inline V_CFLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
{
  int4 lo = tex1Dfetch(t, 2 * i);
  int4 hi = tex1Dfetch(t, 2 * i + 1);
  V_CFLOAT4 result;
  result.x = __hiloint2double(lo.y, lo.x);
  result.y = __hiloint2double(lo.w, lo.z);
  result.z = __hiloint2double(hi.y, hi.x);
  result.w = __hiloint2double(hi.w, hi.z);
  return result;
}
#endif
// Bind the packed v_radius device array to its texture reference so device
// code can read it through fetchVRadius(). No-op unless CUDA_USE_TEXTURE.
inline void BindVRadiusTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  // Point-sampled, unnormalized access; coordinates wrap.
  _v_radius_tex.normalized = false;
  _v_radius_tex.filterMode = cudaFilterModePoint;
  _v_radius_tex.addressMode[0] = cudaAddressModeWrap;
  const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex);
#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();
  // BUGFIX: the bind size was computed with sizeof(X_CFLOAT4); v_radius holds
  // V_CFLOAT4, which differs whenever X and V precisions are compiled differently.
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(V_CFLOAT4));
#else
  // Double precision: each V_CFLOAT4 occupies two int4 texels.
  cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
// Fetch atom i's packed v_radius record (velocity plus, presumably, the
// particle radius in the fourth component -- confirm against the packing code).
// Reads through the texture when CUDA_USE_TEXTURE is defined; at
// V_PRECISION == 2 the doubles are reassembled from int4 texels.
static __device__ inline V_CFLOAT4 fetchVRadius(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
return tex1Dfetch(_v_radius_tex, i);
#else
return tex1Dfetch_double_v(_v_radius_tex, i);
#endif
#else
// texture path disabled: plain global-memory read
return _v_radius[i];
#endif
}
// Bind the packed omega_rmass device array to its texture reference so device
// code can read it through fetchOmegaRmass(). No-op unless CUDA_USE_TEXTURE.
inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  // Point-sampled, unnormalized access; coordinates wrap.
  _omega_rmass_tex.normalized = false;
  _omega_rmass_tex.filterMode = cudaFilterModePoint;
  _omega_rmass_tex.addressMode[0] = cudaAddressModeWrap;
  const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex);
#if V_PRECISION == 1
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();
  // BUGFIX: the bind size was computed with sizeof(X_CFLOAT4); omega_rmass
  // holds V_CFLOAT4, which differs whenever X and V precisions are compiled
  // differently.
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(V_CFLOAT4));
#else
  // Double precision: each V_CFLOAT4 occupies two int4 texels.
  cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
  cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
// Fetch atom i's packed omega_rmass record (angular velocity plus,
// presumably, the per-atom mass in the fourth component -- confirm against
// the packing code). Reads through the texture when CUDA_USE_TEXTURE is
// defined; at V_PRECISION == 2 the doubles are reassembled from int4 texels.
static __device__ inline V_CFLOAT4 fetchOmegaRmass(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
return tex1Dfetch(_omega_rmass_tex, i);
#else
return tex1Dfetch_double_v(_omega_rmass_tex, i);
#endif
#else
// texture path disabled: plain global-memory read
return _omega_rmass[i];
#endif
}
#if F_PRECISION == 2
// Double fetch helpers for F-precision textures: each double is stored as an
// int2 texel and reassembled with __hiloint2double.
static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
{
  int2 bits = tex1Dfetch(t, i);
  return __hiloint2double(bits.y, bits.x);
}
// One F_CFLOAT4 (four doubles) spans two consecutive int4 texels.
static __device__ inline F_CFLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
{
  int4 lo = tex1Dfetch(t, 2 * i);
  int4 hi = tex1Dfetch(t, 2 * i + 1);
  F_CFLOAT4 result;
  result.x = __hiloint2double(lo.y, lo.x);
  result.y = __hiloint2double(lo.w, lo.z);
  result.z = __hiloint2double(hi.y, hi.x);
  result.w = __hiloint2double(hi.w, hi.z);
  return result;
}
#endif
// Bind the per-atom charge array to its texture reference so device code can
// read it through fetchQ(). No-op unless CUDA_USE_TEXTURE is defined.
inline void BindQTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
  // Point-sampled, unnormalized access; coordinates wrap.
  _q_tex.normalized = false;
  _q_tex.filterMode = cudaFilterModePoint;
  _q_tex.addressMode[0] = cudaAddressModeWrap;
  const textureReference* texref = &MY_AP(q_tex);
#if F_PRECISION == 1
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
  cudaBindTexture(0, texref, sdata->atom.q.dev_data, &desc, sdata->atom.nmax * sizeof(F_CFLOAT));
#else
  // Doubles are stored as int2 texels and reassembled on fetch.
  cudaChannelFormatDesc desc = cudaCreateChannelDesc<int2>();
  cudaBindTexture(0, texref, sdata->atom.q.dev_data, &desc, sdata->atom.nmax * sizeof(int2));
#endif
#endif
}
// Fetch the charge of atom i. Reads through the texture when
// CUDA_USE_TEXTURE is defined; at F_PRECISION == 2 the double is
// reassembled from an int2 texel by tex1Dfetch_double_f.
static __device__ inline F_CFLOAT fetchQ(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
return tex1Dfetch(_q_tex, i);
#else
return tex1Dfetch_double_f(_q_tex, i);
#endif
#else
// texture path disabled: plain global-memory read
return _q[i];
#endif
}
#endif
/*
inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
{
#ifdef CUDA_USE_TEXTURE
_coeff_tex.normalized = false; // access with normalized texture coordinates
_coeff_tex.filterMode = cudaFilterModePoint; // Point mode, so no
_coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff_texture_ptr;
cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex));
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_CFLOAT4));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4));
#endif
#endif
}
static __device__ inline X_CFLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
return tex1Dfetch(_x_type_tex,i);
#else
return tex1Dfetch_double(_x_type_tex,i);
#endif
#else
return _x_type[i];
#endif
}
*/
#define SBBITS 30
// Extract the 2-bit special-bond flag stored in the top bits (SBBITS and up)
// of an encoded neighbor index.
static inline __device__ int sbmask(int j)
{
  return (j >> SBBITS) & 3;
}
// Apply the minimum-image convention to the displacement vector delta,
// folding each component by at most one box length along each periodic
// direction. Handles both orthogonal and triclinic boxes; for triclinic
// boxes z is folded first because its wrap shifts y and x by the tilt
// factors _h[3] (yz) and _h[4] (xz), then y (tilt _h[5], xy), then x.
static inline __device__ void minimum_image(X_CFLOAT4 &delta)
{
  if(_triclinic == 0) {
    // Orthogonal box: each component folds independently.
    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
    }

    if(_periodicity[2]) {
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
    }
  } else {
    // BUGFIX: the z-fold was gated on _periodicity[1]; it must test the z
    // direction (_periodicity[2]), matching the orthogonal branch above.
    if(_periodicity[2]) {
      delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
      delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
      delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] :
                 (delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
    }

    if(_periodicity[1]) {
      delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
      delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] :
                 (delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
    }

    if(_periodicity[0]) {
      delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
                 (delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
    }
  }
}
// Store in ci the periodic image of x2 that lies closest to x1: form the
// displacement x2 - x1, fold it with the minimum-image convention, then
// translate back by x1. Only the .x/.y/.z components of ci are written;
// ci.w is left untouched.
static inline __device__ void closest_image(X_CFLOAT4 &x1, X_CFLOAT4 &x2, X_CFLOAT4 &ci)
{
ci.x = x2.x - x1.x;
ci.y = x2.y - x1.y;
ci.z = x2.z - x1.z;
minimum_image(ci);
ci.x += x1.x;
ci.y += x1.y;
ci.z += x1.z;
}

View File

@ -1,22 +0,0 @@
#include "cuda_precision.h"
#include "cuda_shared.h"
#include "cuda_cu.h"
// Record the compile-time configuration of the CUDA object files in sdata so
// the host side can verify it against its own build. Each prec_* field is
// sizeof(type)/4, i.e. 1 for single precision and 2 for double precision.
void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata)
{
sdata->compile_settings.prec_glob = sizeof(CUDA_CFLOAT) / 4;
sdata->compile_settings.prec_x = sizeof(X_CFLOAT) / 4;
sdata->compile_settings.prec_v = sizeof(V_CFLOAT) / 4;
sdata->compile_settings.prec_f = sizeof(F_CFLOAT) / 4;
sdata->compile_settings.prec_pppm = sizeof(PPPM_CFLOAT) / 4;
sdata->compile_settings.prec_fft = sizeof(FFT_CFLOAT) / 4;
// whether FFTs were compiled to run on the GPU via cuFFT
#ifdef FFT_CUFFT
sdata->compile_settings.cufft = 1;
#else
sdata->compile_settings.cufft = 0;
#endif
sdata->compile_settings.arch = CUDA_ARCH;
}

View File

@ -1,344 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_COMMON_H_
#define _CUDA_COMMON_H_
//#include "cutil.h"
#include "cuda_precision.h"
#include "cuda_wrapper_cu.h"
#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types
//this cannot be arbitrarily large, since constant space is limited.
//in principle one could alter potentials to use global memory for parameters; some do that already, since the first examples I encountered had a high number (20+) of atom types
//Christian
#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE)
#define CUDA_MAX_NSPECIAL 25
// define some easy-to-use debug and emulation macros
#ifdef _DEBUG
#define MYDBG(a) a
#else
#define MYDBG(a)
#endif
#if __DEVICE_EMULATION__
#define MYEMU(a) a
#else
#define MYEMU(a)
#endif
#define MYEMUDBG(a) MYEMU(MYDBG(a))
// Add Prefix (needed as workaround, same constant's names in different files causes conflict)
#define MY_ADD_PREFIX(prefix, var) prefix##_##var
#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
#define MY_VAR_TO_STR(var) #var
#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
//#define &MY_AP(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
//#define &MY_AP(var) &(MY_AP(var))
#define CUDA_USE_TEXTURE
#define CUDA_USE_CFLOAT4
//constants used by many classes
//domain
#define _boxhi MY_AP(boxhi)
#define _boxlo MY_AP(boxlo)
#define _subhi MY_AP(subhi)
#define _sublo MY_AP(sublo)
#define _box_size MY_AP(box_size)
#define _prd MY_AP(prd)
#define _periodicity MY_AP(periodicity)
#define _triclinic MY_AP(triclinic)
#define _boxhi_lamda MY_AP(boxhi_lamda)
#define _boxlo_lamda MY_AP(boxlo_lamda)
#define _prd_lamda MY_AP(prd_lamda)
#define _h MY_AP(h)
#define _h_inv MY_AP(h_inv)
#define _h_rate MY_AP(h_rate)
__device__ __constant__ X_CFLOAT _boxhi[3];
__device__ __constant__ X_CFLOAT _boxlo[3];
__device__ __constant__ X_CFLOAT _subhi[3];
__device__ __constant__ X_CFLOAT _sublo[3];
__device__ __constant__ X_CFLOAT _box_size[3];
__device__ __constant__ X_CFLOAT _prd[3];
__device__ __constant__ int _periodicity[3];
__device__ __constant__ int _triclinic;
__device__ __constant__ X_CFLOAT _boxhi_lamda[3];
__device__ __constant__ X_CFLOAT _boxlo_lamda[3];
__device__ __constant__ X_CFLOAT _prd_lamda[3];
__device__ __constant__ X_CFLOAT _h[6];
__device__ __constant__ X_CFLOAT _h_inv[6];
__device__ __constant__ V_CFLOAT _h_rate[6];
//atom properties
#define _x MY_AP(x)
#define _v MY_AP(v)
#define _f MY_AP(f)
#define _tag MY_AP(tag)
#define _type MY_AP(type)
#define _mask MY_AP(mask)
#define _image MY_AP(image)
#define _q MY_AP(q)
#define _mass MY_AP(mass)
#define _rmass MY_AP(rmass)
#define _rmass_flag MY_AP(rmass_flag)
#define _eatom MY_AP(eatom)
#define _vatom MY_AP(vatom)
#define _x_type MY_AP(x_type)
#define _radius MY_AP(radius)
#define _density MY_AP(density)
#define _omega MY_AP(omega)
#define _torque MY_AP(torque)
#define _special MY_AP(special)
#define _maxspecial MY_AP(maxspecial)
#define _nspecial MY_AP(nspecial)
#define _special_flag MY_AP(special_flag)
#define _molecule MY_AP(molecule)
#define _v_radius MY_AP(v_radius)
#define _omega_rmass MY_AP(omega_rmass)
#define _freeze_group_bit MY_AP(freeze_group_bit)
#define _map_array MY_AP(map_array)
__device__ __constant__ X_CFLOAT* _x; //holds pointer to positions
__device__ __constant__ V_CFLOAT* _v;
__device__ __constant__ F_CFLOAT* _f;
__device__ __constant__ int* _tag;
__device__ __constant__ int* _type;
__device__ __constant__ int* _mask;
__device__ __constant__ int* _image;
__device__ __constant__ V_CFLOAT* _mass;
__device__ __constant__ F_CFLOAT* _q;
__device__ __constant__ V_CFLOAT* _rmass;
__device__ __constant__ int _rmass_flag;
__device__ __constant__ ENERGY_CFLOAT* _eatom;
__device__ __constant__ ENERGY_CFLOAT* _vatom;
__device__ __constant__ X_CFLOAT4* _x_type; //holds pointer to positions
__device__ __constant__ X_CFLOAT* _radius;
__device__ __constant__ F_CFLOAT* _density;
__device__ __constant__ V_CFLOAT* _omega;
__device__ __constant__ F_CFLOAT* _torque;
__device__ __constant__ int* _special;
__device__ __constant__ int _maxspecial;
__device__ __constant__ int* _nspecial;
__device__ __constant__ int _special_flag[4];
__device__ __constant__ int* _molecule;
__device__ __constant__ V_CFLOAT4* _v_radius; //holds pointer to positions
__device__ __constant__ V_CFLOAT4* _omega_rmass; //holds pointer to positions
__device__ __constant__ int _freeze_group_bit;
__device__ __constant__ int* _map_array;
#ifdef CUDA_USE_TEXTURE
#define _x_tex MY_AP(x_tex)
#if X_PRECISION == 1
texture<float> _x_tex;
#else
texture<int2, 1> _x_tex;
#endif
#define _type_tex MY_AP(type_tex)
texture<int> _type_tex;
#define _x_type_tex MY_AP(x_type_tex)
#if X_PRECISION == 1
texture<float4, 1> _x_type_tex;
#else
texture<int4, 1> _x_type_tex;
#endif
#define _v_radius_tex MY_AP(v_radius_tex)
#if V_PRECISION == 1
texture<float4, 1> _v_radius_tex;
#else
texture<int4, 1> _v_radius_tex;
#endif
#define _omega_rmass_tex MY_AP(omega_rmass_tex)
#if V_PRECISION == 1
texture<float4, 1> _omega_rmass_tex;
#else
texture<int4, 1> _omega_rmass_tex;
#endif
#define _q_tex MY_AP(q_tex)
#if F_PRECISION == 1
texture<float> _q_tex;
#else
texture<int2, 1> _q_tex;
#endif
#endif
//neighbor
#ifdef IncludeCommonNeigh
#define _inum MY_AP(inum)
#define _inum_border MY_AP(inum_border)
#define _ilist MY_AP(ilist)
#define _ilist_border MY_AP(ilist_border)
#define _numneigh MY_AP(numneigh)
#define _numneigh_border MY_AP(numneigh_border)
#define _numneigh_inner MY_AP(numneigh_inner)
#define _firstneigh MY_AP(firstneigh)
#define _neighbors MY_AP(neighbors)
#define _neighbors_border MY_AP(neighbors_border)
#define _neighbors_inner MY_AP(neighbors_inner)
#define _reneigh_flag MY_AP(reneigh_flag)
#define _triggerneighsq MY_AP(triggerneighsq)
#define _xhold MY_AP(xhold)
#define _maxhold MY_AP(maxhold)
#define _dist_check MY_AP(dist_check)
#define _neighbor_maxlocal MY_AP(neighbor_maxlocal)
#define _maxneighbors MY_AP(maxneighbors)
#define _overlap_comm MY_AP(overlap_comm)
__device__ __constant__ int _inum;
__device__ __constant__ int* _inum_border;
__device__ __constant__ int* _ilist;
__device__ __constant__ int* _ilist_border;
__device__ __constant__ int* _numneigh;
__device__ __constant__ int* _numneigh_border;
__device__ __constant__ int* _numneigh_inner;
__device__ __constant__ int** _firstneigh;
__device__ __constant__ int* _neighbors;
__device__ __constant__ int* _neighbors_border;
__device__ __constant__ int* _neighbors_inner;
__device__ __constant__ int* _reneigh_flag;
__device__ __constant__ X_CFLOAT _triggerneighsq;
__device__ __constant__ X_CFLOAT* _xhold; //holds pointer to positions
__device__ __constant__ int _maxhold;
__device__ __constant__ int _dist_check;
__device__ __constant__ int _neighbor_maxlocal;
__device__ __constant__ int _maxneighbors;
__device__ __constant__ int _overlap_comm;
#endif
//system properties
#define _nall MY_AP(nall)
#define _nghost MY_AP(nghost)
#define _nlocal MY_AP(nlocal)
#define _nmax MY_AP(nmax)
#define _cuda_ntypes MY_AP(cuda_ntypes)
#define _dtf MY_AP(dtf)
#define _dtv MY_AP(dtv)
#define _factor MY_AP(factor)
#define _virial MY_AP(virial)
#define _eng_vdwl MY_AP(eng_vdwl)
#define _eng_coul MY_AP(eng_coul)
#define _molecular MY_AP(molecular)
__device__ __constant__ unsigned _nall;
__device__ __constant__ unsigned _nghost;
__device__ __constant__ unsigned _nlocal;
__device__ __constant__ unsigned _nmax;
__device__ __constant__ unsigned _cuda_ntypes;
__device__ __constant__ V_CFLOAT _dtf;
__device__ __constant__ X_CFLOAT _dtv;
__device__ __constant__ V_CFLOAT _factor;
__device__ __constant__ ENERGY_CFLOAT* _virial;
__device__ __constant__ ENERGY_CFLOAT* _eng_vdwl;
__device__ __constant__ ENERGY_CFLOAT* _eng_coul;
__device__ __constant__ int _molecular;
//other general constants
#define _buffer MY_AP(buffer)
#define _flag MY_AP(flag)
#define _debugdata MY_AP(debugdata)
__device__ __constant__ void* _buffer;
__device__ __constant__ int* _flag;
__device__ __constant__ int* _debugdata;
// pointers to data fields on GPU are hold in constant space
// -> reduces register usage and number of parameters for kernelcalls
// will be variables of file scope in cuda files
// maybe used to output cudaError_t
#define MY_OUTPUT_RESULT(result) \
switch(result) \
{ \
case cudaSuccess: printf(" => cudaSuccess\n"); break; \
case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \
case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \
case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \
case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \
default: printf(" => unknown\n"); break; \
}
#ifdef _DEBUG
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#else
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#endif
# define CUDA_SAFE_CALL_NO_SYNC( call) { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} }
# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
#define X_MASK 1
#define V_MASK 2
#define F_MASK 4
#define TAG_MASK 8
#define TYPE_MASK 16
#define MASK_MASK 32
#define IMAGE_MASK 64
#define Q_MASK 128
#define MOLECULE_MASK 256
#define RMASS_MASK 512
#define RADIUS_MASK 1024
#define DENSITY_MASK 2048
#define OMEGA_MASK 4096
#define TORQUE_MASK 8192
#endif // #ifndef _CUDA_COMMON_H_

View File

@ -1 +0,0 @@
extern "C" void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata);

View File

@ -1,220 +0,0 @@
enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx not yet implemented since they were not needed yet
#include "cuda_data_cu.h"
#include "cuda_wrapper_cu.h"
#include "cuda_data_kernel.cu"
#include <cstdio>
// Upload host double data into the device staging buffer, then launch a
// kernel that converts/reorders it into the float array dev_data (layout
// selected by mode). n[0..2] are the array dimensions; n[1]/n[2] are 0 for
// lower-rank data.
void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  int size = n[0];

  if(n[1] > 0) size *= n[1];

  if(n[2] > 0) size *= n[2];

  dim3 threads;
  threads.x = 1;
  threads.y = 1;
  threads.z = 1;
  dim3 grid;
  grid.x = 1;
  grid.y = 1;
  grid.z = 1;

  // Pick a block size that scales with the element count.
  if(size <= 128 * 30)
    threads.x = 32;
  else if(size <= 256 * 30)
    threads.x = 64;
  else if(size <= 512 * 30)
    threads.x = 128;
  else
    threads.x = 256;

  grid.x = ((size - 1) + threads.x) / threads.x; // ceil(size / threads.x)

  if(grid.x > 32000)
    grid.x = 32000;

  // Spill any remaining elements into the y dimension of the grid.
  while(grid.x * grid.y * threads.x < size) grid.y++;

  // BUGFIX: removed leftover debug scaffolding (a printf per upload, a stack
  // VLA of 'size' floats that could overflow the stack for large arrays, and
  // a device->host verification download with an O(size) comparison loop).
  // This also makes the function consistent with the other
  // CudaData_Upload_* variants.
  size *= sizeof(double);
  CudaWrapper_UploadCudaData(host_data, buffer, size);
  CudaData_Upload_Kernel_DoubleFloat <<< grid, threads>>>((double*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
// Upload host double data into the device staging buffer, then launch a
// kernel that copies/reorders it into the double array dev_data (layout
// selected by mode). n[0..2] are the array dimensions; n[1]/n[2] are 0 for
// lower-rank data.
void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Flatten the (up to) three dimensions into one element count.
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  dim3 block(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block size grows with the element count.
  if(nelem <= 128 * 30)      block.x = 32;
  else if(nelem <= 256 * 30) block.x = 64;
  else if(nelem <= 512 * 30) block.x = 128;
  else                       block.x = 256;

  grid.x = ((nelem - 1) + block.x) / block.x; // ceil(nelem / block.x)

  if(grid.x > 32000) grid.x = 32000;

  // Spill any remaining elements into the y dimension of the grid.
  while(grid.x * grid.y * block.x < nelem) grid.y++;

  int nbytes = nelem * sizeof(double);
  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_DoubleDouble <<< grid, block>>>((double*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
// Upload host float data into the device staging buffer, then launch a
// kernel that converts/reorders it into the double array dev_data (layout
// selected by mode). n[0..2] are the array dimensions; n[1]/n[2] are 0 for
// lower-rank data.
void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Flatten the (up to) three dimensions into one element count.
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  dim3 block(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block size grows with the element count.
  if(nelem <= 128 * 30)      block.x = 32;
  else if(nelem <= 256 * 30) block.x = 64;
  else if(nelem <= 512 * 30) block.x = 128;
  else                       block.x = 256;

  grid.x = ((nelem - 1) + block.x) / block.x; // ceil(nelem / block.x)

  if(grid.x > 32000) grid.x = 32000;

  // Spill any remaining elements into the y dimension of the grid.
  while(grid.x * grid.y * block.x < nelem) grid.y++;

  int nbytes = nelem * sizeof(float);
  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_FloatDouble <<< grid, block>>>((float*)buffer, (double*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
// Upload host float data into the device staging buffer, then launch a
// kernel that copies/reorders it into the float array dev_data (layout
// selected by mode). n[0..2] are the array dimensions; n[1]/n[2] are 0 for
// lower-rank data.
void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Flatten the (up to) three dimensions into one element count.
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  dim3 block(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block size grows with the element count.
  if(nelem <= 128 * 30)      block.x = 32;
  else if(nelem <= 256 * 30) block.x = 64;
  else if(nelem <= 512 * 30) block.x = 128;
  else                       block.x = 256;

  grid.x = ((nelem - 1) + block.x) / block.x; // ceil(nelem / block.x)

  if(grid.x > 32000) grid.x = 32000;

  // Spill any remaining elements into the y dimension of the grid.
  while(grid.x * grid.y * block.x < nelem) grid.y++;

  int nbytes = nelem * sizeof(float);
  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_FloatFloat <<< grid, block>>>((float*)buffer, (float*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
// Upload host int data into the device staging buffer, then launch a kernel
// that copies/reorders it into the int array dev_data (layout selected by
// mode). n[0..2] are the array dimensions; n[1]/n[2] are 0 for lower-rank
// data.
void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer)
{
  // Flatten the (up to) three dimensions into one element count.
  int nelem = n[0];

  if(n[1] > 0) nelem *= n[1];

  if(n[2] > 0) nelem *= n[2];

  dim3 block(1, 1, 1);
  dim3 grid(1, 1, 1);

  // Block size grows with the element count.
  if(nelem <= 128 * 30)      block.x = 32;
  else if(nelem <= 256 * 30) block.x = 64;
  else if(nelem <= 512 * 30) block.x = 128;
  else                       block.x = 256;

  grid.x = ((nelem - 1) + block.x) / block.x; // ceil(nelem / block.x)

  if(grid.x > 32000) grid.x = 32000;

  // Spill any remaining elements into the y dimension of the grid.
  while(grid.x * grid.y * block.x < nelem) grid.y++;

  int nbytes = nelem * sizeof(int);
  CudaWrapper_UploadCudaData(host_data, buffer, nbytes);
  CudaData_Upload_Kernel_IntInt <<< grid, block>>>((int*)buffer, (int*)dev_data, n[0], n[1], n[2], mode);
  cudaThreadSynchronize();
}
// Intentionally empty stub: a download-with-layout-conversion counterpart to
// the CudaData_Upload_* functions was declared but never implemented.
void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer)
{
}

View File

@ -1,13 +0,0 @@
#ifndef CUDA_DATA_CU_H_
#define CUDA_DATA_CU_H_
extern "C" void CudaData_Upload_DoubleFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
extern "C" void CudaData_Upload_DoubleDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
extern "C" void CudaData_Upload_FloatDouble(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
extern "C" void CudaData_Upload_FloatFloat(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
extern "C" void CudaData_Upload_IntInt(void* host_data, void* dev_data, unsigned* n, copy_mode mode, void* buffer);
extern "C" void CudaData_Download(void* host_data, void* dev_data, int host_size, int dev_size, unsigned* n, copy_mode mode, void* buffer);
#endif /*CUDA_DATA_CU_H_*/

View File

@ -1,195 +0,0 @@
// One thread per element: convert the staged double buffer into the float
// device array, optionally reordering the layout (yx = 2-D transpose,
// xzy = swap of the last two indices of a 3-D array; the other modes are
// element-wise copies).
__global__ void CudaData_Upload_Kernel_DoubleFloat(double* buffer, float* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx; // plain 1-D copy

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  // BUGFIX: every case previously fell through into the following ones, so
  // e.g. mode xx also executed the yx transpose write (and divided by ny,
  // which may be 0). Each case now ends with a break.
  switch(mode) {
    case xx:
      dev_data[i] = buffer[i];
      break;

    case xy:
      dev_data[i] = buffer[i];
      break;

    case yx:
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:
      dev_data[i] = buffer[i];
      break;

    case xzy:
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}
// One thread per element: copy the staged double buffer into the double
// device array, optionally reordering the layout (yx = 2-D transpose,
// xzy = swap of the last two indices of a 3-D array).
__global__ void CudaData_Upload_Kernel_DoubleDouble(double* buffer, double* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx; // plain 1-D copy

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  // BUGFIX: cases previously fell through into each other; each case now
  // ends with a break.
  switch(mode) {
    case xx:
      dev_data[i] = buffer[i];
      break;

    case xy:
      dev_data[i] = buffer[i];
      break;

    case yx:
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:
      dev_data[i] = buffer[i];
      break;

    case xzy:
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}
// One thread per element: convert the staged float buffer into the double
// device array, optionally reordering the layout (yx = 2-D transpose,
// xzy = swap of the last two indices of a 3-D array).
__global__ void CudaData_Upload_Kernel_FloatDouble(float* buffer, double* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx; // plain 1-D copy

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  // BUGFIX: cases previously fell through into each other; each case now
  // ends with a break.
  switch(mode) {
    case xx:
      dev_data[i] = buffer[i];
      break;

    case xy:
      dev_data[i] = buffer[i];
      break;

    case yx:
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:
      dev_data[i] = buffer[i];
      break;

    case xzy:
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}
// One thread per element: copy the staged float buffer into the float
// device array, optionally reordering the layout (yx = 2-D transpose,
// xzy = swap of the last two indices of a 3-D array).
__global__ void CudaData_Upload_Kernel_FloatFloat(float* buffer, float* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx; // plain 1-D copy

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  // BUGFIX: cases previously fell through into each other; each case now
  // ends with a break.
  switch(mode) {
    case xx:
      dev_data[i] = buffer[i];
      break;

    case xy:
      dev_data[i] = buffer[i];
      break;

    case yx:
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:
      dev_data[i] = buffer[i];
      break;

    case xzy:
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}
// One thread per element: copy the staged int buffer into the int device
// array, optionally reordering the layout (yx = 2-D transpose, xzy = swap
// of the last two indices of a 3-D array).
__global__ void CudaData_Upload_Kernel_IntInt(int* buffer, int* dev_data,
    unsigned nx, unsigned ny, unsigned nz, copy_mode mode)
{
  if(mode == x) mode = xx; // plain 1-D copy

  unsigned length = nx;

  if(ny > 0) length *= ny;

  if(nz > 0) length *= nz;

  unsigned i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x, j, k, l;

  if(i >= length) return;

  // BUGFIX: cases previously fell through into each other; each case now
  // ends with a break.
  switch(mode) {
    case xx:
      dev_data[i] = buffer[i];
      break;

    case xy:
      dev_data[i] = buffer[i];
      break;

    case yx:
      j = i / ny;
      k = i % ny;
      dev_data[k * nx + j] = buffer[j * ny + k];
      break;

    case xyz:
      dev_data[i] = buffer[i];
      break;

    case xzy:
      j = i / (ny * nz);
      k = (i % (ny * nz)) / nz;
      l = i % nz;
      dev_data[j * ny * nz + l * ny + k] = buffer[j * ny * nz + k * nz + l];
      break;
  }
}

File diff suppressed because it is too large Load Diff

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_Pair_GenerateXType(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_RevertXType(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_BuildXHold(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_CollectForces(cuda_shared_data* sdata, int eflag, int vflag);

File diff suppressed because it is too large Load Diff

View File

@ -1,126 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Dynamic shared memory used for the block-level reductions below.
extern __shared__ ENERGY_CFLOAT sharedmem[];
// Block-wide reduction of per-thread energy/virial accumulators held in
// shared memory. Layout (each slot is blockDim.x wide):
//   [evdwl] [ecoul, only if coulflag] [6 virial components, only if vflag].
// Thread 0 writes each reduced value, scaled by 0.5 (presumably to undo
// pair double counting -- confirm), into _buffer as per-block partials,
// one gridDim.x*gridDim.y slab per quantity, for a later cross-block pass.
static inline __device__ void PairVirialCompute_A_Kernel(int eflag, int vflag, int coulflag = 0)
{
__syncthreads();
ENERGY_CFLOAT* shared = sharedmem;
if(eflag) {
reduceBlock(shared);
// advance to the next blockDim.x-wide slot
shared += blockDim.x;
if(coulflag) {
reduceBlock(shared);
shared += blockDim.x;
}
}
if(vflag) {
// reduce all six virial slots in place
reduceBlock(shared + 0 * blockDim.x);
reduceBlock(shared + 1 * blockDim.x);
reduceBlock(shared + 2 * blockDim.x);
reduceBlock(shared + 3 * blockDim.x);
reduceBlock(shared + 4 * blockDim.x);
reduceBlock(shared + 5 * blockDim.x);
}
if(threadIdx.x == 0) {
// thread 0 publishes this block's partial sums
shared = sharedmem;
ENERGY_CFLOAT* buffer = (ENERGY_CFLOAT*) _buffer;
if(eflag) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0];
shared += blockDim.x;
// skip to the next quantity's slab in _buffer
buffer += gridDim.x * gridDim.y;
if(coulflag) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0];
shared += blockDim.x;
buffer += gridDim.x * gridDim.y;
}
}
if(vflag) {
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[0 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[1 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[2 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[3 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[4 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[5 * blockDim.x];
}
}
__syncthreads();
}
// Second reduction pass: sums the n per-block partials of one quantity per
// launched block and accumulates the totals into the global scalars.
// gridDim.x encodes which quantities are present in _buffer:
//   1: evdwl; 2: evdwl+ecoul; 6: virial only; 7: evdwl+virial;
//   8: evdwl+ecoul+virial.
__global__ void MY_AP(PairVirialCompute_reduce)(int n)
{
sharedmem[threadIdx.x] = ENERGY_F(0.0);
ENERGY_CFLOAT sum = ENERGY_F(0.0);
ENERGY_CFLOAT* buf = (ENERGY_CFLOAT*) _buffer;
// this block's slab of n partials
buf = &buf[blockIdx.x * n];
//if(blockIdx.x==2) buf=&buf[n];
// reduce the slab blockDim.x elements at a time; tail is padded with zeros
for(int i = 0; i < n; i += blockDim.x) {
sharedmem[threadIdx.x] = (i + threadIdx.x < n) ? buf[i + threadIdx.x] : ENERGY_F(0.0);
__syncthreads();
reduceBlock(sharedmem);
if(threadIdx.x == 0) sum += sharedmem[0];
}
// thread 0 routes the total to the right destination based on gridDim.x
if(threadIdx.x == 0) {
if(gridDim.x == 1) { //evdwl
_eng_vdwl[0] += sum;
}
if(gridDim.x == 2) { //evdwl + ecoul only
if(blockIdx.x == 0)
_eng_vdwl[0] += sum;
else
_eng_coul[0] += sum;
}
if(gridDim.x == 6) { //virial
_virial[blockIdx.x] += sum;
}
if(gridDim.x == 7) { //evdwl+virial
if(blockIdx.x == 0)
_eng_vdwl[0] += sum;
else _virial[blockIdx.x - 1] += sum;
}
if(gridDim.x == 8) { //evdwl+ecoul+virial
if(blockIdx.x == 0)
_eng_vdwl[0] += sum;
else if(blockIdx.x == 1)
_eng_coul[0] += sum;
else
_virial[blockIdx.x - 2] += sum;
}
}
}

View File

@ -1,278 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CUDA_PRECISION_H_
#define CUDA_PRECISION_H_
/* Type definitions for mixed-precision computation in the CUDA part of
 * LAMMPS-CUDA.  The default behaviour is set by the global CUDA_PRECISION
 * macro (1 = single, 2 = double; can be overridden at compile time), and each
 * physical quantity (FFT, PPPM, force, energy, position, velocity) may choose
 * its own precision independently.  For each quantity XX:
 *   XX_CFLOAT : the floating-point type used for that quantity
 *   XX_F(x)   : literal-suffix helper -- XX_F(1.0) expands to 1.0f in single
 *               precision and to 1.0 in double precision
 */
// Optional virtual-particle-grid (binning) support: CUDA_IF_BINNING(code)
// expands to its argument only when CUDA_USE_BINNING is defined.
#ifdef CUDA_USE_BINNING
#define CUDA_IF_BINNING(a) a
#else
#define CUDA_IF_BINNING(a)
#endif
//GLOBAL default precision; 2 (double) if not specified by the build
#ifdef CUDA_PRECISION
#if CUDA_PRECISION == 1
#define CUDA_CFLOAT float
#define CUDA_F(x) x##f
#endif
#if CUDA_PRECISION == 2
#define CUDA_CFLOAT double
#define CUDA_F(x) x
#endif
#endif
#ifndef CUDA_PRECISION
#define CUDA_CFLOAT double
#define CUDA_F(x) x
#define CUDA_PRECISION 2
#endif
//--------------------------------
//-----------FFT-----------------
//--------------------------------
// FFT precision; falls back to the global setting when FFT_PRECISION_CU is
// not defined by the build.
#ifdef FFT_PRECISION_CU
#if FFT_PRECISION_CU == 1
#define FFT_CFLOAT float
#define FFT_F(x) x##f
#endif
#if FFT_PRECISION_CU == 2
#define FFT_CFLOAT double
#define FFT_F(x) x
#endif
#endif
#ifndef FFT_PRECISION_CU
#define FFT_CFLOAT CUDA_CFLOAT
#define FFT_F(x) CUDA_F(x)
#define FFT_PRECISION_CU CUDA_PRECISION
#endif
//--------------------------------
//-----------PPPM-----------------
//--------------------------------
// PPPM (long-range electrostatics) precision; defaults to the global setting.
#ifndef PPPM_PRECISION
#define PPPM_PRECISION CUDA_PRECISION
#endif
#ifdef PPPM_PRECISION
#if PPPM_PRECISION == 1
#define PPPM_CFLOAT float
// NOTE(review): float3 is a CUDA vector *type*, not a macro, so this #ifdef
// is normally false and the struct fallback below is what gets used — confirm.
#ifdef float3
#define PPPM_CFLOAT3 float3
#else
struct PPPM_CFLOAT3 {
  PPPM_CFLOAT x;
  PPPM_CFLOAT y;
  PPPM_CFLOAT z;
};
#endif
#define PPPM_F(x) x##f
#endif
#if PPPM_PRECISION == 2
#define PPPM_CFLOAT double
struct PPPM_CFLOAT3 {
  PPPM_CFLOAT x;
  PPPM_CFLOAT y;
  PPPM_CFLOAT z;
};
#define PPPM_F(x) x
#endif
#endif
//--------------------------------
//-----------FORCE-----------------
//--------------------------------
// Force computation precision; defaults to the global setting.
#ifdef F_PRECISION
#if F_PRECISION == 1
#define F_CFLOAT float
#define F_F(x) x##f
#endif
#if F_PRECISION == 2
#define F_CFLOAT double
#define F_F(x) x
#endif
#endif
#ifndef F_PRECISION
#define F_CFLOAT CUDA_CFLOAT
#define F_F(x) CUDA_F(x)
#define F_PRECISION CUDA_PRECISION
#endif
// Pick the math intrinsics matching the force precision.
#if F_PRECISION == 1
#define _SQRT_ sqrtf
#define _RSQRT_ rsqrtf
#define _EXP_ expf
#else
#define _SQRT_ sqrt
#define _RSQRT_ rsqrt
#define _EXP_ exp
#endif
// Vector helper types; in single precision reuse CUDA's native float2/3/4.
#if F_PRECISION == 2
struct F_CFLOAT2 {
  F_CFLOAT x;
  F_CFLOAT y;
};
struct F_CFLOAT3 {
  F_CFLOAT x;
  F_CFLOAT y;
  F_CFLOAT z;
};
struct F_CFLOAT4 {
  F_CFLOAT x;
  F_CFLOAT y;
  F_CFLOAT z;
  F_CFLOAT w;
};
#else
#define F_CFLOAT2 float2
#define F_CFLOAT3 float3
#define F_CFLOAT4 float4
#endif
//--------------------------------
//-----------ENERGY-----------------
//--------------------------------
// Energy/virial accumulation precision.  ENERGY_PRECISION may be set by the
// build (1 = float, 2 = double); otherwise it inherits the global
// CUDA_PRECISION setting, exactly like the FFT/FORCE/X/V sections above.
// (A redundant leading "#ifndef ENERGY_PRECISION" fallback that duplicated
// the one below — a benign identical redefinition — has been removed for
// consistency with the other sections.)
#ifdef ENERGY_PRECISION
#if ENERGY_PRECISION == 1
#define ENERGY_CFLOAT float
#define ENERGY_F(x) x##f
#endif
#if ENERGY_PRECISION == 2
#define ENERGY_CFLOAT double
#define ENERGY_F(x) x
#endif
#endif
#ifndef ENERGY_PRECISION
#define ENERGY_CFLOAT CUDA_CFLOAT
#define ENERGY_F(x) CUDA_F(x)
#define ENERGY_PRECISION CUDA_PRECISION
#endif
//--------------------------------
//-----------POSITIONS------------
//--------------------------------
// Position precision; defaults to the global setting.
#ifdef X_PRECISION
#if X_PRECISION == 1
#define X_CFLOAT float
#define X_F(x) x##f
#endif
#if X_PRECISION == 2
#define X_CFLOAT double
#define X_F(x) x
#endif
#endif
#ifndef X_PRECISION
#define X_CFLOAT CUDA_CFLOAT
#define X_F(x) CUDA_F(x)
#define X_PRECISION CUDA_PRECISION
#endif
// Position vector helpers; reuse native float2/3/4 in single precision.
#if X_PRECISION == 2
struct X_CFLOAT2 {
  X_CFLOAT x;
  X_CFLOAT y;
};
struct X_CFLOAT3 {
  X_CFLOAT x;
  X_CFLOAT y;
  X_CFLOAT z;
};
struct X_CFLOAT4 {
  X_CFLOAT x;
  X_CFLOAT y;
  X_CFLOAT z;
  X_CFLOAT w;
};
#else
#define X_CFLOAT2 float2
#define X_CFLOAT3 float3
#define X_CFLOAT4 float4
#endif
//--------------------------------
//-----------velocities-----------
//--------------------------------
// Velocity precision; defaults to the global setting.
#ifdef V_PRECISION
#if V_PRECISION == 1
#define V_CFLOAT float
#define V_F(x) x##f
#endif
#if V_PRECISION == 2
#define V_CFLOAT double
#define V_F(x) x
#endif
#endif
#ifndef V_PRECISION
#define V_CFLOAT CUDA_CFLOAT
#define V_F(x) CUDA_F(x)
#define V_PRECISION CUDA_PRECISION
#endif
#if V_PRECISION == 2
struct V_CFLOAT4 {
  V_CFLOAT x;
  V_CFLOAT y;
  V_CFLOAT z;
  V_CFLOAT w;
};
#else
#define V_CFLOAT4 float4
#endif
// Timing support: with NO_PREC_TIMING, my_gettime() is a no-op stub.
// NOTE(review): the stub leaves my_times fields uninitialised, so timing
// deltas are meaningless (but harmless) in that configuration — confirm.
#ifdef NO_PREC_TIMING
struct my_times {
  unsigned int tv_sec;
  unsigned int tv_nsec;
};
#define my_gettime(a,b)
#else
#define my_times timespec
#define my_gettime(a,b) clock_gettime(a,b)
#endif
#endif /*CUDA_PRECISION_H_*/

View File

@ -1,370 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_SHARED_H_
#define _CUDA_SHARED_H_
#include "cuda_precision.h"
#define CUDA_MAX_DEBUG_SIZE 1000 //size of debugdata array (allows for so many doubles or twice as many int)
// Generic handle for an array living in device memory.
struct dev_array {
  void* dev_data; // pointer to memory address on cuda device
  unsigned dim[3]; // array dimensions
};
struct cuda_shared_atom { // relevant data from atom class
  dev_array dx; // cumulated distance for binning settings
  dev_array x; // position
  dev_array v; // velocity
  dev_array f; // force
  dev_array tag; // global atom IDs (the "global ID number" comment below likely belongs here — verify)
  dev_array type; // global ID number, there are ghosttype = ntypes (ntypescuda=ntypes+1)
  dev_array mask; // group membership bitmask
  dev_array image; // periodic image flags
  dev_array q; // charges
  dev_array mass; // per-type masses
  dev_array rmass; // per-atom masses
  dev_array radius; // per-atom radius
  dev_array density;
  dev_array omega;
  dev_array torque;
  dev_array molecule;
  dev_array special;
  int maxspecial;
  dev_array nspecial;
  int* special_flag;
  int molecular;
  dev_array eatom; // per-atom energy
  dev_array vatom; // per-atom virial
  int need_eatom;
  int need_vatom;
  dev_array x_type; // position + type in X_CFLOAT4 struct
  dev_array v_radius; // velocity + radius in V_CFLOAT4 struct currently only used for granular atom_style
  dev_array omega_rmass; // omega + rmass packed in V_CFLOAT4 (original comment was a copy/paste of v_radius) — granular atom_style only
  double* mass_host; // remember per-type host pointer to masses
  //int natoms; // total # of atoms in system, could be 0
  int nghost; // and ghost atoms on this proc
  int nlocal; // # of owned
  int nall; // total # of atoms in this proc
  int nmax; // max # of owned+ghost in arrays on this proc
  int ntypes;
  int q_flag; // do we have charges?
  int rmass_flag; // do we have per-atom masses?
  int firstgroup;
  int nfirst;
  int update_nlocal;
  int update_nmax;
  int update_neigh;
  dev_array xhold; // position at last neighboring
  X_CFLOAT triggerneighsq; // maximum square movement before reneighboring
  int reneigh_flag; // is reneighboring necessary
  int maxhold; // size of xhold
  int dist_check; //perform distance check for reneighboring
  dev_array binned_id; //id of each binned atom (not tag!!)
  dev_array binned_idnew; //new id of each binned atom for sorting basically setting atom[binned_id[k]] at atom[binned_newid[k]]
  float bin_extraspace;
  int bin_dim[3];
  int bin_nmax;
  dev_array map_array;
};
struct cuda_shared_pair { // relevant data from pair class
  char cudable_force; // check for (cudable_force!=0)
  X_CFLOAT cut_global;
  X_CFLOAT cut_inner_global;
  X_CFLOAT cut_coul_global;
  double** cut; // type-type cutoff
  double** cutsq; // type-type cutoff
  double** cut_inner; // type-type cutoff for coul
  double** cut_coul; // type-type cutoff for coul
  double** coeff1; // type-type pair parameters
  double** coeff2;
  double** coeff3;
  double** coeff4;
  double** coeff5;
  double** coeff6;
  double** coeff7;
  double** coeff8;
  double** coeff9;
  double** coeff10;
  double** offset;
  double* special_lj; // special-bond scale factors (LJ)
  double* special_coul; // special-bond scale factors (coulomb)
  dev_array virial; // ENERGY_CFLOAT
  dev_array eng_vdwl; // ENERGY_CFLOAT
  dev_array eng_coul; // ENERGY_CFLOAT
  X_CFLOAT cut_coulsq_global;
  F_CFLOAT g_ewald, kappa;
  int freeze_group_bit;
  // device-side (gm = GPU memory) copies of the coeff tables above
  dev_array coeff1_gm;
  dev_array coeff2_gm;
  dev_array coeff3_gm;
  dev_array coeff4_gm;
  dev_array coeff5_gm;
  dev_array coeff6_gm;
  dev_array coeff7_gm;
  dev_array coeff8_gm;
  dev_array coeff9_gm;
  dev_array coeff10_gm;
  int lastgridsize;
  int n_energy_virial;
  int collect_forces_later;
  int use_block_per_atom;
  int override_block_per_atom;
  bool neighall;
};
struct cuda_shared_domain { // relevant data from domain class
  X_CFLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
  X_CFLOAT subhi[3];
  X_CFLOAT boxlo[3];
  X_CFLOAT boxhi[3];
  X_CFLOAT prd[3];
  int periodicity[3]; // xyz periodicity as array
  int triclinic;
  // triclinic tilt factors and lamda (fractional) coordinates
  X_CFLOAT xy;
  X_CFLOAT xz;
  X_CFLOAT yz;
  X_CFLOAT boxlo_lamda[3];
  X_CFLOAT boxhi_lamda[3];
  X_CFLOAT prd_lamda[3];
  X_CFLOAT h[6];
  X_CFLOAT h_inv[6];
  V_CFLOAT h_rate[6];
  int update;
};
// Shared state for the PPPM (long-range electrostatics) GPU implementation.
struct cuda_shared_pppm {
  char cudable_force;
#ifdef FFT_CUFFT
  // work arrays only needed when FFTs run on the GPU via cuFFT
  FFT_CFLOAT* work1;
  FFT_CFLOAT* work2;
  FFT_CFLOAT* work3;
  PPPM_CFLOAT* greensfn;
  PPPM_CFLOAT* fkx;
  PPPM_CFLOAT* fky;
  PPPM_CFLOAT* fkz;
  PPPM_CFLOAT* vg;
#endif
  int* part2grid;
  PPPM_CFLOAT* density_brick;
  int* density_brick_int;
  PPPM_CFLOAT density_intScale;
  PPPM_CFLOAT* vdx_brick;
  PPPM_CFLOAT* vdy_brick;
  PPPM_CFLOAT* vdz_brick;
  PPPM_CFLOAT* density_fft;
  ENERGY_CFLOAT* energy;
  ENERGY_CFLOAT* virial;
  // grid extents owned by this proc (in = owned, out = owned + ghost)
  int nxlo_in;
  int nxhi_in;
  int nxlo_out;
  int nxhi_out;
  int nylo_in;
  int nyhi_in;
  int nylo_out;
  int nyhi_out;
  int nzlo_in;
  int nzhi_in;
  int nzlo_out;
  int nzhi_out;
  int nx_pppm;
  int ny_pppm;
  int nz_pppm;
  PPPM_CFLOAT qqrd2e;
  int order;
  // float3 sublo;
  PPPM_CFLOAT* rho_coeff;
  int nmax;
  int nlocal;
  PPPM_CFLOAT* debugdata;
  PPPM_CFLOAT delxinv;
  PPPM_CFLOAT delyinv;
  PPPM_CFLOAT delzinv;
  int nlower;
  int nupper;
  PPPM_CFLOAT shiftone;
  PPPM_CFLOAT3* fH;
};
// Shared state for inter-proc communication buffers and swap plans.
struct cuda_shared_comm {
  int maxswap;
  int maxlistlength;
  dev_array pbc;
  dev_array slablo;
  dev_array slabhi;
  dev_array multilo;
  dev_array multihi;
  dev_array sendlist;
  int grow_flag;
  int comm_phase;
  int nsend;
  int* nsend_swap;
  int* send_size;
  int* recv_size;
  double** buf_send;
  void** buf_send_dev;
  double** buf_recv;
  void** buf_recv_dev;
  void* buffer;
  int buffer_size;
  double overlap_split_ratio;
};
struct cuda_shared_neighlist { // member of CudaNeighList, has no instance in cuda_shared_data
  int maxlocal;
  int inum; // # of I atoms neighbors are stored for local indices of I atoms
  int inum_border2;
  dev_array inum_border; // # of atoms which interact with border atoms
  dev_array ilist;
  dev_array ilist_border;
  dev_array numneigh;
  dev_array numneigh_inner;
  dev_array numneigh_border;
  dev_array firstneigh;
  dev_array neighbors;
  dev_array neighbors_border;
  dev_array neighbors_inner;
  int maxpage;
  dev_array page_pointers;
  dev_array* pages;
  int maxneighbors;
  int neigh_lists_per_page;
  double** cutneighsq; // per type-pair neighbor cutoff squared (host)
  CUDA_CFLOAT* cu_cutneighsq; // device copy of cutneighsq
  int* binned_id;
  int* bin_dim;
  int bin_nmax;
  float bin_extraspace;
  double maxcut;
  // neighbor exclusion data (by type, group, molecule)
  dev_array ex_type;
  int nex_type;
  dev_array ex1_bit;
  dev_array ex2_bit;
  int nex_group;
  dev_array ex_mol_bit;
  int nex_mol;
};
struct cuda_compile_settings { // this is used to compare compile settings (i.e. precision) of the cu files, and the cpp files
  int prec_glob;
  int prec_x;
  int prec_v;
  int prec_f;
  int prec_pppm;
  int prec_fft;
  int cufft;
  int arch;
};
// Wall-clock timing accumulators (seconds) for profiling the CUDA code paths.
struct cuda_timings_struct {
  //Debug:
  double test1;
  double test2;
  //transfers
  double transfer_upload_tmp_constr;
  double transfer_download_tmp_deconstr;
  //communication
  double comm_forward_total;
  double comm_forward_mpi_upper;
  double comm_forward_mpi_lower;
  double comm_forward_kernel_pack;
  double comm_forward_kernel_unpack;
  double comm_forward_kernel_self;
  double comm_forward_upload;
  double comm_forward_download;
  double comm_exchange_total;
  double comm_exchange_mpi;
  double comm_exchange_kernel_pack;
  double comm_exchange_kernel_unpack;
  double comm_exchange_kernel_fill;
  double comm_exchange_cpu_pack;
  double comm_exchange_upload;
  double comm_exchange_download;
  double comm_border_total;
  double comm_border_mpi;
  double comm_border_kernel_pack;
  double comm_border_kernel_unpack;
  double comm_border_kernel_self;
  double comm_border_kernel_buildlist;
  double comm_border_upload;
  double comm_border_download;
  //pair forces
  double pair_xtype_conversion;
  double pair_kernel;
  double pair_virial;
  double pair_force_collection;
  //neighbor
  double neigh_bin;
  double neigh_build;
  double neigh_special;
  //PPPM
  double pppm_particle_map;
  double pppm_make_rho;
  double pppm_brick2fft;
  double pppm_poisson;
  double pppm_fillbrick;
  double pppm_fieldforce;
  double pppm_compute;
};
struct cuda_shared_data { // holds space for all relevant data from the different classes
  void* buffer; //holds temporary GPU data [data used in subroutines, which does not have to be consistent outside of that routine]
  int buffersize; //maxsize of buffer
  int buffer_new; //should be 1 if the pointer to buffer has changed
  void* flag;
  void* debugdata; //array for easily collecting debugdata from device class cuda contains the corresponding cu_debugdata and host array
  cuda_shared_atom atom;
  cuda_shared_pair pair;
  cuda_shared_domain domain;
  cuda_shared_pppm pppm;
  cuda_shared_comm comm;
  cuda_compile_settings compile_settings;
  cuda_timings_struct cuda_timings;
  int exchange_dim;
  int me; //mpi rank
  unsigned int datamask;
  int overlap_comm;
};
#endif // #ifndef _CUDA_SHARED_H_

View File

@ -1,337 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#include "cuda_shared.h"
#include "cuda_common.h"
#include "cuda_wrapper_cu.h"
#include "cuda_wrapper_kernel.cu"
// File-scope bookkeeping for the CUDA wrapper layer.
// NOTE(review): total_gpu_mem is a signed int counting bytes; it can overflow
// for allocations totalling >2GB — confirm whether that matters here.
static int CudaWrapper_total_gpu_mem = 0;
static double CudaWrapper_total_upload_time = 0;   // accumulated H->D copy time (s)
static double CudaWrapper_total_download_time = 0; // accumulated D->H copy time (s)
static double CudaWrapper_cpubuffer_upload_time = 0;
static double CudaWrapper_cpubuffer_download_time = 0;
static cudaStream_t* streams;                      // grown by CudaWrapper_AddStreams
static int nstreams = 0;                           // current number of streams
// Select and initialise a CUDA device for MPI rank 'me'.
//   ppn        : processes per node (used to map ranks to devices)
//   devicelist : optional explicit rank->device mapping; NULL = use devices
//                sorted by multiprocessor count (fastest first)
// Devices in "shared" compute mode are assigned explicitly per rank; otherwise
// the set of valid devices is handed to the runtime via cudaSetValidDevices.
// Exits the process on any unusable configuration.
// Fix vs. original: corrected the garbled error message
// "more GPUs per node when there are" -> "than there are".
void CudaWrapper_Init(int argc, char** argv, int me, int ppn, int* devicelist)
{
  MYDBG(printf("# CUDA: debug mode on\n");)
#if __DEVICE_EMULATION__
  printf("# CUDA: emulation mode on\n");
#else
  // modified from cutil.h
  // statics make repeated calls idempotent (re-init only in shared mode)
  static int deviceCount = 0;
  static bool sharedmode = false;
  if(deviceCount && !sharedmode) return;
  if(deviceCount && sharedmode) cudaThreadExit();
  CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceCount(&deviceCount));
  if(deviceCount == 0) {
    fprintf(stderr, "cutil error: no devices supporting CUDA.\n");
    exit(EXIT_FAILURE);
  }
  MYDBG(printf("# CUDA There are %i devices supporting CUDA in this system.\n", deviceCount);)
  cudaDeviceProp deviceProp[deviceCount];
  for(int i = 0; i < deviceCount; i++)
    CUDA_SAFE_CALL_NO_SYNC(cudaGetDeviceProperties(&(deviceProp[i]), i));
  // sort device indices by multiprocessor count, descending (bubble sort)
  int dev_list[deviceCount];
  for(int i = 0; i < deviceCount; i++) dev_list[i] = i;
  for(int i = 0; i < deviceCount; i++) {
    for(int j = 0; j < deviceCount - 1 - i; j++)
      if(deviceProp[dev_list[j]].multiProcessorCount < deviceProp[dev_list[j + 1]].multiProcessorCount) {
        int k = dev_list[j];
        dev_list[j] = dev_list[j + 1];
        dev_list[j + 1] = k;
      }
  }
  // computeMode == 0 (cudaComputeModeDefault) means the device is shareable;
  // enable host-mapped memory on every device while probing
  for(int i = 0; i < deviceCount; i++) {
    if((deviceProp[dev_list[i]].computeMode == 0)) sharedmode = true;
    cudaSetDevice(i);
    cudaSetDeviceFlags(cudaDeviceMapHost);
  }
  if(sharedmode) {
    if(ppn && (me % ppn + 1) > deviceCount) {
      printf("Asking for more GPUs per node than there are. Reduce gpu/node setting.\n");
      exit(0);
    }
    int devicea = me % ppn;  // this rank's slot on its node
    if(devicelist) devicea = devicelist[devicea];
    else
      devicea = dev_list[devicea];
    if(devicea >= deviceCount) {
      printf("Asking for non existent GPU %i. Found only %i GPUs.\n", devicea, deviceCount);
      exit(0);
    }
    MYDBG(
      printf(" # CUDA myid: %i take device: %i\n", me, devicea);
    )
    CUDA_SAFE_CALL(cudaSetDevice(devicea));
  } else {
    // exclusive mode: let the runtime pick from the sorted candidate list
    CUDA_SAFE_CALL(cudaSetValidDevices(dev_list, deviceCount));
  }
  cudaThreadSynchronize();
  // sanity-check the chosen device's compute capability vs. build arch
  int dev;
  CUDA_SAFE_CALL(cudaGetDevice(&dev));
  if(deviceProp[dev].major < 1) {
    fprintf(stderr, "CUDA error: device does not support CUDA.\n");
    exit(EXIT_FAILURE);
  } else if((deviceProp[dev].major == 1) && (deviceProp[dev].minor != 3)) {
    fprintf(stderr, "CUDA error: You need a device with compute capability 1.3 or higher (Device %i is a %s with CC %i.%i)\n", dev, deviceProp[dev].name, deviceProp[dev].major, deviceProp[dev].minor);
    exit(EXIT_FAILURE);
  }
  if((deviceProp[dev].major == 2) && (CUDA_ARCH < 20)) {
    fprintf(stderr, "CUDA warning: You are using a compute %i.%i or higher GPU while LAMMPScuda has been compiled for architecture 1.3\n", deviceProp[dev].major, deviceProp[dev].minor);
  }
  if((deviceProp[dev].major == 1) && (CUDA_ARCH >= 20)) {
    fprintf(stderr, "CUDA error: You are using a compute 1.3 GPU while LAMMPScuda has been compiled for architecture %i\n", CUDA_ARCH);
    exit(EXIT_FAILURE);
  }
  fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);
  MYDBG(fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);)
  MYDBG
  (
    printf("name = %s\n", deviceProp[dev].name);
    printf("totalGlobalMem = %u\n", deviceProp[dev].totalGlobalMem);
    printf("sharedMemPerBlock = %i\n", deviceProp[dev].sharedMemPerBlock);
    printf("regsPerBlock = %i\n", deviceProp[dev].regsPerBlock);
    printf("warpSize = %i\n", deviceProp[dev].warpSize);
    printf("memPitch = %i\n", deviceProp[dev].memPitch);
    printf("maxThreadsPerBlock = %i\n", deviceProp[dev].maxThreadsPerBlock);
    printf("maxThreadsDim = [%i, %i, %i]\n", deviceProp[dev].maxThreadsDim[0], deviceProp[dev].maxThreadsDim[1], deviceProp[dev].maxThreadsDim[2]);
    printf("maxGridSize = [%i, %i, %i]\n", deviceProp[dev].maxGridSize[0], deviceProp[dev].maxGridSize[1], deviceProp[dev].maxGridSize[2]);
    printf("totalConstMem = %i\n", deviceProp[dev].totalConstMem);
    printf("major . minor = %i . %i\n", deviceProp[dev].major, deviceProp[dev].minor);
    printf("clockRate = %i\n", deviceProp[dev].clockRate);
    printf("textureAlignment = %i\n", deviceProp[dev].textureAlignment);
    printf("deviceOverlap = %i\n", deviceProp[dev].deviceOverlap);
    printf("multiProcessorCount = %i\n", deviceProp[dev].multiProcessorCount);
    printf("computeMode = %i\n", deviceProp[dev].computeMode);
  )
#endif
}
// Allocate nbytes of device memory and track the total; returns device pointer.
void* CudaWrapper_AllocCudaData(unsigned nbytes)
{
  void* dev_data;
  CUDA_SAFE_CALL(cudaMalloc((void**)&dev_data, nbytes));
  MYDBG(printf("# CUDA: allocated %u bytes on device at dev%p\n", nbytes, dev_data);)
  CudaWrapper_total_gpu_mem += nbytes;
  return dev_data;
}
// Synchronous host->device copy; accumulates wall time into the upload counter.
void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes)
{
  MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);)
  cudaThreadSynchronize();
  my_times time1, time2;
  my_gettime(CLOCK_REALTIME, &time1);
  CUDA_SAFE_CALL(cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice));
  my_gettime(CLOCK_REALTIME, &time2);
  CudaWrapper_total_upload_time +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
}
// Asynchronous host->device copy on the given stream (not timed; caller must
// sync the stream before reusing host_data).
// Fix vs. original: the debug message wrongly said "downloading ... from device"
// (copy/paste from the download path).
void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream)
{
  MYDBG(printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data, host_data);)
  cudaMemcpyAsync(dev_data, host_data, nbytes, cudaMemcpyHostToDevice, streams[stream]);
}
// Synchronous device->host copy; accumulates wall time into the download counter.
void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes)
{
  MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
  cudaThreadSynchronize();
  my_times time1, time2;
  my_gettime(CLOCK_REALTIME, &time1);
  CUDA_SAFE_CALL(cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost));
  my_gettime(CLOCK_REALTIME, &time2);
  CudaWrapper_total_download_time +=
    time2.tv_sec - time1.tv_sec + 1.0 * (time2.tv_nsec - time1.tv_nsec) / 1000000000;
}
// Asynchronous device->host copy on the given stream (not timed).
void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream)
{
  MYDBG(printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data);)
  cudaMemcpyAsync(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost, streams[stream]);
}
// Free device memory; nbytes is only used to keep the allocation counter honest.
void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes)
{
  MYDBG(printf("# CUDA: freeing memory at dev%p with %i bytes (last adress: %p)\n", dev_data, nbytes, (char*)dev_data + nbytes);)
  CUDA_SAFE_CALL(cudaFree(dev_data));
  CudaWrapper_total_gpu_mem -= nbytes;
}
// Fill nbytes of device memory with the given byte value.
void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes)
{
  MYDBG(printf("# CUDA: setting %u bytes to %i at dev%p\n", nbytes, value, dev_data);)
  CUDA_SAFE_CALL(cudaMemset(dev_data, value, nbytes));
}
// Device-to-device copy of nbytes.
void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes)
{
  MYDBG(printf("# CUDA: copy %u bytes from dev%p to dev%p\n", nbytes, dev_source, dev_dest);)
  CUDA_SAFE_CALL(cudaMemcpy(dev_dest, dev_source, nbytes, cudaMemcpyDeviceToDevice));
}
// Allocate page-locked (pinned) host memory; optionally mapped into the
// device address space and/or write-combined for faster H->D transfers.
void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped, bool writeCombined)
{
  void* host_data;
  int flags = 0;
  if(mapped) flags = flags | cudaHostAllocMapped;
  if(writeCombined) flags = flags | cudaHostAllocWriteCombined;
  CUDA_SAFE_CALL(cudaHostAlloc((void**)&host_data, nbytes, flags));
  // CUDA_SAFE_CALL( cudaMallocHost((void**)&host_data, nbytes) );
  MYDBG(printf("# CUDA: allocated %u bytes pinned memory on host at %p\n", nbytes, host_data);)
  return host_data;
}
// Free pinned host memory allocated by CudaWrapper_AllocPinnedHostData.
// NULL is tolerated (no-op).
void CudaWrapper_FreePinnedHostData(void* host_data)
{
  MYDBG(printf("# CUDA: freeing pinned host memory at %p \n", host_data);)
  if(host_data)
    CUDA_SAFE_CALL(cudaFreeHost(host_data));
}
// Print the last CUDA error (clears the error state) prefixed with 'comment'.
// Note: prints unconditionally, even when the last call succeeded.
void cuda_check_error(char* comment)
{
  printf("ERROR-CUDA %s %s\n", comment, cudaGetErrorString(cudaGetLastError()));
}
// Return the number of bytes currently in use on the device, as reported by
// the driver (total - free), not by our own allocation counter.
int CudaWrapper_CheckMemUsage()
{
  size_t free, total;
  cudaMemGetInfo(&free, &total);
  return total - free; //possible with cuda 3.0 ???
  //return CudaWrapper_total_gpu_mem;
}
// Accessors for the file-scope transfer-time counters.  Each returns the
// accumulated time in seconds; passing reset=true clears the counter first,
// so the call then returns 0.
double CudaWrapper_CheckUploadTime(bool reset)
{
  if(reset) {
    CudaWrapper_total_upload_time = 0.0;
  }
  return CudaWrapper_total_upload_time;
}

double CudaWrapper_CheckDownloadTime(bool reset)
{
  if(reset) {
    CudaWrapper_total_download_time = 0.0;
  }
  return CudaWrapper_total_download_time;
}

double CudaWrapper_CheckCPUBufUploadTime(bool reset)
{
  if(reset) {
    CudaWrapper_cpubuffer_upload_time = 0.0;
  }
  return CudaWrapper_cpubuffer_upload_time;
}

double CudaWrapper_CheckCPUBufDownloadTime(bool reset)
{
  if(reset) {
    CudaWrapper_cpubuffer_download_time = 0.0;
  }
  return CudaWrapper_cpubuffer_download_time;
}

// Externally-measured CPU-buffer pack/unpack times are folded in here.
void CudaWrapper_AddCPUBufUploadTime(double dt)
{
  CudaWrapper_cpubuffer_upload_time += dt;
}

void CudaWrapper_AddCPUBufDownloadTime(double dt)
{
  CudaWrapper_cpubuffer_download_time += dt;
}
// Block the host until all previously issued device work has completed.
void CudaWrapper_Sync()
{
  cudaThreadSynchronize();
}
// Block the host until all work queued on the given stream has completed.
void CudaWrapper_SyncStream(int stream)
{
  cudaStreamSynchronize(streams[stream]);
}
// Grow the stream pool by n new streams, preserving the existing ones.
// (streams is a file-scope static, zero-initialised, so the first call
// starts from an empty pool.)
void CudaWrapper_AddStreams(int n)
{
  cudaStream_t* new_streams = new cudaStream_t[nstreams + n];
  for(int i = 0; i < nstreams; i++) new_streams[i] = streams[i];
  for(int i = nstreams; i < nstreams + n; i++) cudaStreamCreate(&new_streams[i]);
  if(nstreams > 0)
    delete [] streams;
  streams = new_streams;
  nstreams += n;
}
// Expose the raw stream array (as void* to keep CUDA types out of callers).
void* CudaWrapper_returnStreams()
{
  return (void*) streams;
}
// Number of streams currently in the pool.
int CudaWrapper_returnNStreams()
{
  return nstreams;
}

View File

@ -1,52 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_DATA_WRAPPER_H_
#define _CUDA_DATA_WRAPPER_H_
// C-linkage interface to the CUDA wrapper layer (device setup, device/host
// memory management, transfers, timing counters, and stream management).
// See cuda_wrapper.cu for the definitions and per-function documentation.
// Fix vs. original: parameter name 'writeCombind' corrected to 'writeCombined'
// to match the definition (declaration-only, no ABI impact).
extern "C" void  CudaWrapper_Init(int argc, char** argv, int me = 0, int ppn = 2, int* devicelist = NULL);
extern "C" void* CudaWrapper_AllocCudaData(unsigned nbytes);
extern "C" void  CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes);
extern "C" void  CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id);
extern "C" void  CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes);
extern "C" void  CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id);
extern "C" void  CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes = 0);
extern "C" void  CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes);
extern "C" void  CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes);
extern "C" void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped = false, bool writeCombined = false);
extern "C" void  CudaWrapper_FreePinnedHostData(void* dev_data);
extern "C" void  cuda_check_error(char* comment);
extern "C" int   CudaWrapper_CheckMemUsage();
// Timing queries: reset=true clears the counter (the call then returns 0).
extern "C" double CudaWrapper_CheckUploadTime(bool reset = false);
extern "C" double CudaWrapper_CheckDownloadTime(bool reset = false);
extern "C" double CudaWrapper_CheckCPUBufUploadTime(bool reset = false);
extern "C" double CudaWrapper_CheckCPUBufDownloadTime(bool reset = false);
extern "C" void  CudaWrapper_AddCPUBufUploadTime(double dt);
extern "C" void  CudaWrapper_AddCPUBufDownloadTime(double dt);
// Stream management.
extern "C" void  CudaWrapper_Sync();
extern "C" void  CudaWrapper_SyncStream(int n);
extern "C" void  CudaWrapper_AddStreams(int n);
extern "C" void* CudaWrapper_returnStreams();
extern "C" int   CudaWrapper_returnNStreams();
#endif // _CUDA_DATA_WRAPPER_H_

View File

@ -1,24 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// empty file to obey common make rule

View File

@ -1,202 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX domain
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "domain_cu.h"
#include "domain_kernel.cu"
void Cuda_Domain_UpdateBuffer(cuda_shared_data* sdata, int size)
{
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_Domain Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(image) , & sdata->atom.image.dev_data, sizeof(int*));
}
void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(boxlo) , sdata->domain.boxlo , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(boxhi) , sdata->domain.boxhi , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(sublo) , sdata->domain.sublo , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(subhi) , sdata->domain.subhi , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(periodicity) , sdata->domain.periodicity , 3 * sizeof(int));
cudaMemcpyToSymbol(MY_AP(triclinic) , & sdata->domain.triclinic , sizeof(int));
cudaMemcpyToSymbol(MY_AP(boxlo_lamda) , sdata->domain.boxlo_lamda , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(boxhi_lamda) , sdata->domain.boxhi_lamda , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(prd_lamda) , sdata->domain.prd_lamda , 3 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , 6 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(h_inv) , sdata->domain.h_inv , 6 * sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(h_rate) , sdata->domain.h_rate , 6 * sizeof(V_CFLOAT));
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*));
cudaMemcpyToSymbol(MY_AP(debugdata) , &sdata->debugdata , sizeof(int*));
}
// One-time initialization of this module's device symbols: pushes both
// the per-atom pointers/counts and the box geometry to the device.
void Cuda_Domain_Init(cuda_shared_data* sdata)
{
  Cuda_Domain_UpdateNmax(sdata);
  Cuda_Domain_UpdateDomain(sdata);
}
void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_groupbit, double* extent)
{
Cuda_Domain_UpdateNmax(sdata);
//if(sdata->domain.update)
Cuda_Domain_UpdateDomain(sdata);
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int box_change = 0;
if(extent) box_change = 1;
int sharedmem = 0;
if(box_change) sharedmem = 6 * sizeof(X_CFLOAT);
int3 layout = getgrid(sdata->atom.nlocal, sharedmem);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
sharedmem *= threads.x;
if((box_change) && (sdata->buffer_new or (6 * sizeof(X_CFLOAT)*grid.x * grid.y > sdata->buffersize)))
Cuda_Domain_UpdateBuffer(sdata, layout.x * layout.y * 6 * sizeof(X_CFLOAT));
Domain_PBC_Kernel <<< grid, threads, sharedmem>>>(deform_remap, deform_groupbit, box_change);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed");
if(box_change) {
X_CFLOAT buf2[6 * layout.x * layout.y];
X_CFLOAT* buf = buf2;
int flag;
cudaMemcpy(buf, sdata->buffer, 6 * layout.x * layout.y * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
//printf("Flag: %i\n",flag);
X_CFLOAT min, max;
min = 1.0 * BIG;
max = -1.0 * BIG;
for(int i = 0; i < layout.x * layout.y; i++) {
if(buf[i] < min) min = buf[i];
if(buf[i + layout.x * layout.y] > max) max = buf[i + layout.x * layout.y];
}
extent[0] = min;
extent[1] = max;
buf += 2 * layout.x * layout.y;
min = 1.0 * BIG;
max = -1.0 * BIG;
for(int i = 0; i < layout.x * layout.y; i++) {
if(buf[i] < min) min = buf[i];
if(buf[i + layout.x * layout.y] > max) max = buf[i + layout.x * layout.y];
}
extent[2] = min;
extent[3] = max;
buf += 2 * layout.x * layout.y;
min = 1.0 * BIG;
max = -1.0 * BIG;
for(int i = 0; i < layout.x * layout.y; i++) {
if(buf[i] < min) min = buf[i];
if(buf[i + layout.x * layout.y] > max) max = buf[i + layout.x * layout.y];
}
extent[4] = min;
extent[5] = max;
//printf("Extent: %lf %lf %lf %lf %lf %lf\n",extent[0],extent[1],extent[2],extent[3],extent[4],extent[5]);
/* int n=grid.x*grid.y;
if(n<128) threads.x=32;
else if(n<256) threads.x=64;
else threads.x=128;
sharedmem=n*sizeof(X_CFLOAT);
grid.x=6;
grid.y=1;
Domain_reduceBoxExtent<<<grid, threads,sharedmem>>>(extent,n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Domain_reduceBoxExtent: Kernel execution failed");*/
}
}
// Convert the first n atoms from lamda (fractional) coordinates to box
// coordinates on the device (x = h * lamda + boxlo).
void Cuda_Domain_lamda2x(cuda_shared_data* sdata, int n)
{
  // refresh device symbols before launching
  Cuda_Domain_UpdateNmax(sdata);
  //if(sdata->domain.update)
  Cuda_Domain_UpdateDomain(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));

  int3 layout = getgrid(n);
  dim3 block_dim(layout.z, 1, 1);
  dim3 grid_dim(layout.x, layout.y, 1);

  Domain_lamda2x_Kernel <<< grid_dim, block_dim, 0>>>(n);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Domain_lamda2x: Kernel execution failed");
}
// Convert the first n atoms from box coordinates to lamda (fractional)
// coordinates on the device (lamda = h_inv * (x - boxlo)).
void Cuda_Domain_x2lamda(cuda_shared_data* sdata, int n)
{
  // refresh device symbols before launching
  Cuda_Domain_UpdateNmax(sdata);
  //if(sdata->domain.update)
  Cuda_Domain_UpdateDomain(sdata);
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));

  int3 layout = getgrid(n);
  dim3 block_dim(layout.z, 1, 1);
  dim3 grid_dim(layout.x, layout.y, 1);

  Domain_x2lamda_Kernel <<< grid_dim, block_dim, 0>>>(n);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Domain_x2lamda: Kernel execution failed");
}

View File

@ -1,29 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// Host-side entry points for GPU domain operations.
// Pushes atom-array pointers and box geometry to device symbols.
extern "C" void Cuda_Domain_Init(cuda_shared_data* sdata);
// Wraps atoms through periodic boundaries; when extent != NULL the box
// extent is also computed and returned as {xlo,xhi,ylo,yhi,zlo,zhi}.
extern "C" void Cuda_Domain_PBC(cuda_shared_data* sdata, int deform_remap, int deform_groupbit, double* extent = NULL);
// Fractional (lamda) <-> box coordinate conversion for the first n atoms.
extern "C" void Cuda_Domain_lamda2x(cuda_shared_data* sdata, int n);
extern "C" void Cuda_Domain_x2lamda(cuda_shared_data* sdata, int n);

View File

@ -1,293 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ X_CFLOAT sharedmem[];
#define BIG 1e10
/* Wrap each local atom back into the periodic box (one thread per atom)
 * and update its packed image flags (three 10-bit counters: x in bits
 * 0-9, y in bits 10-19, z in bits 20-29).  When deform_remap is set,
 * atoms in deform_groupbit also get their velocity corrected by the box
 * deformation rate _h_rate.  When box_change is set, each block writes
 * six extent partials (min/max per dimension) into _buffer, laid out as
 * six consecutive stripes of gridDim.x*gridDim.y values, for the host
 * to reduce.
 *
 * BUGFIX: the non-periodic-y fallback used to assign lo[1]/hi[1] to
 * minx/maxx instead of miny/maxy, clobbering the x extent and leaving
 * the y extent at +/-BIG whenever y was periodic. */
__global__ void Domain_PBC_Kernel(int deform_remap, int deform_groupbit, int box_change)
{
  int idim, otherdims;
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  X_CFLOAT lo[3];
  X_CFLOAT hi[3];
  X_CFLOAT* period;

  // orthogonal boxes wrap in box coordinates, triclinic in lamda coords
  if(_triclinic == 0) {
    lo[0] = _boxlo[0];
    lo[1] = _boxlo[1];
    lo[2] = _boxlo[2];

    hi[0] = _boxhi[0];
    hi[1] = _boxhi[1];
    hi[2] = _boxhi[2];
    period = _prd;
  } else {
    lo[0] = _boxlo_lamda[0];
    lo[1] = _boxlo_lamda[1];
    lo[2] = _boxlo_lamda[2];

    hi[0] = _boxhi_lamda[0];
    hi[1] = _boxhi_lamda[1];
    hi[2] = _boxhi_lamda[2];
    period = _prd_lamda;
  }

  // Pre-fill this block's six extent slots with the box center so blocks
  // whose threads all lie beyond _nlocal still contribute harmless values.
  // NOTE(review): every thread of a block writes the same value to the
  // same slot here -- a benign race.
  X_CFLOAT tmpx = X_F(0.5) * (hi[0] + lo[0]);
  X_CFLOAT tmpy = X_F(0.5) * (hi[1] + lo[1]);
  X_CFLOAT tmpz = X_F(0.5) * (hi[2] + lo[2]);

  X_CFLOAT* buf = (X_CFLOAT*) _buffer;
  buf += blockIdx.x * gridDim.y + blockIdx.y;
  buf[0] = tmpx;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpx;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpy;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpy;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpz;
  buf += gridDim.x * gridDim.y;
  buf[0] = tmpz;

  if(i < _nlocal) {

    // ---- x dimension ----
    if(_periodicity[0]) {
      if(_x[i] < lo[0]) {
        _x[i] += period[0];

        if(deform_remap && _mask[i] & deform_groupbit) _v[i] += _h_rate[0];

        // decrement the 10-bit x image counter
        idim = _image[i] & 1023;
        otherdims = _image[i] ^ idim;
        idim--;
        idim &= 1023;
        _image[i] = otherdims | idim;
      }

      if(_x[i] >= hi[0]) {
        _x[i] -= period[0];
        _x[i] = MAX(_x[i], lo[0]);   // guard against round-off undershoot

        if(deform_remap && _mask[i] & deform_groupbit) _v[i] -= _h_rate[0];

        idim = _image[i] & 1023;
        otherdims = _image[i] ^ idim;
        idim++;
        idim &= 1023;
        _image[i] = otherdims | idim;
      }
    }

    // ---- y dimension (couples vx via the xy tilt rate _h_rate[5]) ----
    if(_periodicity[1]) {
      if(_x[i + _nmax] < lo[1]) {
        _x[i + _nmax] += period[1];

        if(deform_remap && _mask[i] & deform_groupbit) {
          _v[i] += _h_rate[5];
          _v[i + _nmax] += _h_rate[1];
        }

        idim = (_image[i] >> 10) & 1023;
        otherdims = _image[i] ^ (idim << 10);
        idim--;
        idim &= 1023;
        _image[i] = otherdims | (idim << 10);
      }

      if(_x[i + _nmax] >= hi[1]) {
        _x[i + _nmax] -= period[1];
        _x[i + _nmax] = MAX(_x[i + _nmax], lo[1]);

        if(deform_remap && _mask[i] & deform_groupbit) {
          _v[i] -= _h_rate[5];
          _v[i + _nmax] -= _h_rate[1];
        }

        idim = (_image[i] >> 10) & 1023;
        otherdims = _image[i] ^ (idim << 10);
        idim++;
        idim &= 1023;
        _image[i] = otherdims | (idim << 10);
      }
    }

    // ---- z dimension (couples vx/vy via tilt rates _h_rate[4]/[3]) ----
    if(_periodicity[2]) {
      if(_x[i + 2 * _nmax] < lo[2]) {
        _x[i + 2 * _nmax] += period[2];

        if(deform_remap && _mask[i] & deform_groupbit) {
          _v[i] += _h_rate[4];
          _v[i + _nmax] += _h_rate[3];
          _v[i + 2 * _nmax] += _h_rate[2];
        }

        idim = _image[i] >> 20;
        otherdims = _image[i] ^ (idim << 20);
        idim--;
        idim &= 1023;
        _image[i] = otherdims | (idim << 20);
      }

      if(_x[i + 2 * _nmax] >= hi[2]) {
        _x[i + 2 * _nmax] -= period[2];
        _x[i + 2 * _nmax] = MAX(_x[i + 2 * _nmax], lo[2]);

        if(deform_remap && _mask[i] & deform_groupbit) {
          _v[i] -= _h_rate[4];
          _v[i + _nmax] -= _h_rate[3];
          _v[i + 2 * _nmax] -= _h_rate[2];
        }

        idim = _image[i] >> 20;
        otherdims = _image[i] ^ (idim << 20);
        idim++;
        idim &= 1023;
        _image[i] = otherdims | (idim << 20);
      }
    }

    // capture the wrapped coordinates for the extent reduction below
    if(box_change) {
      tmpx = _x[i];
      tmpy = _x[i + _nmax];
      tmpz = _x[i + 2 * _nmax];
    }
  }

  __syncthreads();

  if(box_change) {
    X_CFLOAT minx = BIG;
    X_CFLOAT maxx = -BIG;
    X_CFLOAT miny = BIG;
    X_CFLOAT maxy = -BIG;
    X_CFLOAT minz = BIG;
    X_CFLOAT maxz = -BIG;

    // For a periodic dimension the extent is simply the box; only
    // non-periodic dimensions need the block-level min/max reduction.
    // (Threads with i >= _nlocal contribute the box center, which lies
    // inside any valid extent.)
    if(not _periodicity[0]) {
      sharedmem[threadIdx.x] = tmpx;
      minOfBlock(sharedmem);
      minx = sharedmem[0];
      __syncthreads();
      sharedmem[threadIdx.x] = tmpx;
      maxOfBlock(sharedmem);
      maxx = sharedmem[0];
      __syncthreads();
    } else {
      minx = lo[0];
      maxx = hi[0];
    }

    if(not _periodicity[1]) {
      sharedmem[threadIdx.x] = tmpy;
      minOfBlock(sharedmem);
      miny = sharedmem[0];
      __syncthreads();
      sharedmem[threadIdx.x] = tmpy;
      maxOfBlock(sharedmem);
      maxy = sharedmem[0];
      __syncthreads();
    } else {
      miny = lo[1];   // BUGFIX: was "minx = lo[1]"
      maxy = hi[1];   // BUGFIX: was "maxx = hi[1]"
    }

    if(not _periodicity[2]) {
      sharedmem[threadIdx.x] = tmpz;
      minOfBlock(sharedmem);
      minz = sharedmem[0];
      __syncthreads();
      sharedmem[threadIdx.x] = tmpz;
      maxOfBlock(sharedmem);
      maxz = sharedmem[0];
      __syncthreads();
    } else {
      minz = lo[2];
      maxz = hi[2];
    }

    // thread 0 publishes this block's six partials
    if(threadIdx.x == 0) {
      buf = (X_CFLOAT*) _buffer;
      buf += blockIdx.x * gridDim.y + blockIdx.y;
      buf[0] = minx;
      buf += gridDim.x * gridDim.y;
      buf[0] = maxx;
      buf += gridDim.x * gridDim.y;
      buf[0] = miny;
      buf += gridDim.x * gridDim.y;
      buf[0] = maxy;
      buf += gridDim.x * gridDim.y;
      buf[0] = minz;
      buf += gridDim.x * gridDim.y;
      buf[0] = maxz;
    }
  }
}
// Final device-side reduction of the per-block extent partials left in
// _buffer by Domain_PBC_Kernel.  Intended launch: gridDim.x == 6 (one
// block per extent component); even blocks reduce a minimum, odd blocks
// a maximum, matching the [min,max,min,max,min,max] stripe layout.
// NOTE(review): only referenced from commented-out code in
// Cuda_Domain_PBC -- the host loop currently performs this reduction.
__global__ void Domain_reduceBoxExtent(double* extent, int n)
{
  X_CFLOAT* buf = (X_CFLOAT*) _buffer;
  buf += blockIdx.x * n;            // this block's stripe of n partials
  copyGlobToShared(buf, sharedmem, n);

  if(blockIdx.x % 2 == 0)
    minOfData(sharedmem, n);        // components 0,2,4: minima
  else
    maxOfData(sharedmem, n);        // components 1,3,5: maxima

  extent[blockIdx.x] = sharedmem[0];
}
// Device kernel: convert atom i from lamda (fractional) to box
// coordinates, x = h * lamda + boxlo.  _h holds the 6 independent
// components of the upper-triangular box matrix; _h[5] couples y into x,
// _h[4] and _h[3] couple z into x and y respectively.
__global__ void Domain_lamda2x_Kernel(int n)
{
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= n) return;

  // read y/z before the in-place overwrite of the x component
  const X_CFLOAT la_y = _x[i + _nmax];
  const X_CFLOAT la_z = _x[i + 2 * _nmax];

  _x[i] = _h[0] * _x[i] + _h[5] * la_y + _h[4] * la_z + _boxlo[0];
  _x[i + _nmax] = _h[1] * la_y + _h[3] * la_z + _boxlo[1];
  _x[i + 2 * _nmax] = _h[2] * la_z + _boxlo[2];
}
// Device kernel: convert atom i from box to lamda (fractional)
// coordinates, lamda = h_inv * (x - boxlo).
__global__ void Domain_x2lamda_Kernel(int n)
{
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= n) return;

  const X_CFLOAT dx = _x[i] - _boxlo[0];
  const X_CFLOAT dy = _x[i + _nmax] - _boxlo[1];
  const X_CFLOAT dz = _x[i + 2 * _nmax] - _boxlo[2];

  _x[i] = _h_inv[0] * dx + _h_inv[5] * dy + _h_inv[4] * dz;
  _x[i + _nmax] = _h_inv[1] * dy + _h_inv[3] * dz;
  _x[i + 2 * _nmax] = _h_inv[2] * dz;
}

View File

@ -1,103 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
//#define CUDA_PRECISION 1
#include "cuda_precision.h"
#include "cuda_common.h"
// Complex value used by the CUDA FFT path; precision of FFT_CFLOAT is
// selected in cuda_precision.h.  Stored interleaved (re, im).
struct FFT_DATA {
  FFT_CFLOAT re;   // real part
  FFT_CFLOAT im;   // imaginary part
};
#include "fft3d_cuda_cu.h"
#include "fft3d_cuda_kernel.cu"
#include <stdio.h>
// Expand a real-valued double array of nfast*nmid*nslow elements into an
// interleaved complex FFT_CFLOAT array (imaginary parts zeroed).
// One block per (slow, mid) line, one thread per fast-axis element.
void initfftdata(double* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast, 1, 1);

  cudaThreadSynchronize();
  initfftdata_kernel <<< grid, threads, 0>>>(in, out);
  cudaThreadSynchronize();
  MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}
// Permute (transpose) a complex 3d array on the device so the fast axis
// becomes the slowest.  Two threads per complex element (re + im).
void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
{
  dim3 grid(nslow, nmid, 1);
  dim3 threads(nfast * 2, 1, 1);

  permute_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out);
  cudaThreadSynchronize();
  MYDBG(printf("ERROR-CUDA permute_kernel: %s\n", cudaGetErrorString(cudaGetLastError())));
}
// Permute a complex 3d array while applying the scale factor baked into
// permute_scale_kernel.  Same launch geometry as permute().
void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow)
{
  dim3 grid;
  grid.x = nslow;
  grid.y = nmid;
  grid.z = 1;
  dim3 threads;
  threads.x = nfast * 2;
  threads.y = 1;
  threads.z = 1;
  // BUGFIX: previously launched permute_kernel, which permutes WITHOUT
  // scaling; permute_scale_kernel (otherwise unused) does both.
  permute_scale_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out);
  cudaThreadSynchronize();
}
// Permute only the sub-volume [ilo..ihi] x [jlo..jhi] x [klo..khi] of a
// complex 3d array.  One block per (i, j) pair of the sub-volume, two
// threads per complex element along k.
void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
{
  dim3 grid(ihi - ilo + 1, jhi - jlo + 1, 1);
  dim3 threads((khi - klo + 1) * 2, 1, 1);

  permute_part_kernel <<< grid, threads, 0>>>((FFT_CFLOAT*)in, (FFT_CFLOAT*)out, nfast, nmid, nslow, ihi, ilo, jhi, jlo, khi, klo);
  cudaThreadSynchronize();
}
// Block the host until all previously launched FFT-related kernels on the
// default stream have completed.
void FFTsyncthreads()
{
  cudaThreadSynchronize();
}

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// Copy a real double array into an interleaved complex FFT_CFLOAT array.
extern "C" void initfftdata(double* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow);
// Axis permutation (transpose) of a complex 3d array on the device.
extern "C" void permute(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow);
// Permutation variant that also applies a scale factor.
extern "C" void permute_scale(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow);
// Permutation restricted to the [ilo..ihi]x[jlo..jhi]x[klo..khi] sub-volume.
extern "C" void permute_part(FFT_DATA* in, FFT_DATA* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo);
// Host-side barrier for the FFT kernels.
extern "C" void FFTsyncthreads();

View File

@ -1,46 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// One thread per real input element: write it as the real part of the
// interleaved complex output and zero the imaginary part.
__global__ void initfftdata_kernel(double* in, FFT_CFLOAT* out)
{
  const int idx = ((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x;

  out[2 * idx] = in[idx];   // real part
  out[2 * idx + 1] = 0;     // imaginary part
}
// Transpose an interleaved complex array: the fast axis (threads) becomes
// the slowest axis of the output.  threadIdx.x/2 is the complex index
// along the fast axis; the low bit selects re (0) or im (1).
__global__ void permute_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out)
{
  const int src = ((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x;
  const int cplx = threadIdx.x / 2;
  const int comp = threadIdx.x - 2 * cplx;   // 0 = re, 1 = im
  const int dst = 2 * ((cplx * gridDim.x + blockIdx.x) * gridDim.y + blockIdx.y) + comp;

  out[dst] = in[src];
}
// Same transpose as permute_kernel, but each value is multiplied by
// gridDim.x * gridDim.y * blockDim.x * 0.5 (the total complex element
// count, since blockDim.x counts re+im pairs).  The multiply chain is
// kept in the original order to preserve floating-point rounding.
__global__ void permute_scale_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out)
{
  const int src = ((blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x) + threadIdx.x;
  const int cplx = threadIdx.x / 2;
  const int comp = threadIdx.x - 2 * cplx;   // 0 = re, 1 = im
  const int dst = 2 * ((cplx * gridDim.x + blockIdx.x) * gridDim.y + blockIdx.y) + comp;

  out[dst] = in[src] * gridDim.x * gridDim.y * blockDim.x * 0.5;
}
// Transpose only a sub-volume of the input.  blockIdx.x/.y index the
// (i, j) position inside the sub-volume; threads cover the k range.
// NOTE(review): the "blockIdx.y - jlo" term in the destination index
// looks suspicious (blockIdx.y is already 0-based over the sub-volume);
// reproduced unchanged -- confirm against the caller.
__global__ void permute_part_kernel(FFT_CFLOAT* in, FFT_CFLOAT* out, int nfast, int nmid, int nslow, int ihi, int ilo, int jhi, int jlo, int khi, int klo)
{
  const int cplx = threadIdx.x / 2;
  const int comp = threadIdx.x - 2 * cplx;   // 0 = re, 1 = im
  const int ispan = ihi - ilo + 1;
  const int jspan = jhi - jlo + 1;

  const int dst = 2 * (cplx * ispan * jspan + blockIdx.x * jspan + blockIdx.y - jlo) + comp;
  const int src = 2 * (blockIdx.x + ilo) * nmid * nslow + 2 * (blockIdx.y + jlo) * nmid + threadIdx.x + 2 * klo;

  out[dst] = in[src];
}

View File

@ -1,93 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_add_force_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_addforce_cuda_cu.h"
#include "fix_addforce_cuda_kernel.cu"
// Ensure the shared device buffer can hold the per-block partial sums of
// this fix (4 F_CFLOAT values per block: energy + 3 force components) and
// publish the buffer address to the device symbol.
// Cleanup: removed dead dim3 locals (threads/grid) -- only the layout
// product is needed to size the buffer.
void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
  int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_CFLOAT);

  if(sdata->buffersize < size) {
    // NOTE(review): buffersize is in bytes although the message says kB
    MYDBG(printf("Cuda_FixAddForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
// Refresh the atom counts and per-atom device array pointers used by this
// fix after a reallocation (nmax change).
void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax), &sdata->atom.nmax, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(mask), &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(x), &sdata->atom.x.dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f), &sdata->atom.f.dev_data, sizeof(F_CFLOAT*));
}
// One-time initialization: push the per-atom pointers/counts to the device.
void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixAddForceCuda_UpdateNmax(sdata);
}
// Add the constant force (axvalue, ayvalue, azvalue) to every atom in the
// group, and reduce (energy, fx, fy, fz) of the pre-update forces into
// aforiginal[0..3] via a two-pass block reduction.
void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue, F_CFLOAT* aforiginal)
{
  // refresh device symbols / buffer only when host-side state changed
  if(sdata->atom.update_nmax)
    Cuda_FixAddForceCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixAddForceCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  // pass 1: per-block partial sums into the device buffer
  Cuda_FixAddForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 4* sizeof(F_CFLOAT)>>> (groupbit, axvalue, ayvalue, azvalue);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed");

  // pass 2: one block per reduced quantity folds the partials together
  // NOTE(review): n passed to the reducer is grid.x only; this assumes
  // getgrid produced layout.y == 1 here -- confirm.
  const int oldgrid = grid.x;
  grid.x = 4;
  threads.x = 512;
  reduce_foriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, aforiginal);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// One-time initialization of the fix addforce device symbols.
extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata);
// Adds a constant force to group atoms; returns the summed
// (energy, fx, fy, fz) of the original forces in aforiginal[0..3].
extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue, F_CFLOAT* aforiginal);

View File

@ -1,90 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_CFLOAT sharedmem[];
/* Pass 1 of fix addforce: one thread per atom.  For group atoms, record
 * (energy, fx, fy, fz) of the pre-update force into shared memory and
 * then apply the constant force.  The energy term is -F.x (potential of
 * a constant force).  Each block reduces its shared arrays and writes
 * four partials into _buffer, in four stripes of gridDim.x*gridDim.y.
 * Non-group / out-of-range threads contribute zeros; all threads must
 * still execute the reduceBlock calls.
 * NOTE(review): no explicit __syncthreads() before reduceBlock --
 * presumably reduceBlock synchronizes internally; confirm against
 * crm_cuda_utils.cu. */
__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // zero all four shared-memory stripes so inactive threads are neutral
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit)
      //if (iregion >= 0 &&
      //match(x[i][0],x[i][1],x[i][2],iregion)) //currently not supported
    {
      // energy = -(added force) . (position)
      sharedmem[threadIdx.x] = -xvalue * _x[i] - yvalue * _x[i + 1 * _nmax] - zvalue * _x[i + 2 * _nmax];
      // original force components, before the update below
      sharedmem[threadIdx.x + blockDim.x] = _f[i];
      sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 1 * _nmax];
      sharedmem[threadIdx.x + 3 * blockDim.x] = _f[i + 2 * _nmax];
      _f[i] += xvalue;
      _f[i + 1 * _nmax] += yvalue;
      _f[i + 2 * _nmax] += zvalue;
    }

  // block-wide sums of the four quantities (all threads participate)
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  reduceBlock(&sharedmem[3 * blockDim.x]);
  F_CFLOAT* buffer = (F_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // publish this block's partials, one stripe per quantity
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
  }
}
/* Pass 2 of fix addforce: fold the n per-block partials of each of the
 * four quantities into one value.  Launched with gridDim.x == 4 (one
 * block per quantity); blockIdx.x selects the stripe of _buffer written
 * by the compute kernel.  Only thread 0 accumulates the running total
 * and writes the result.
 * NOTE(review): only thread 0 ever writes sharedmem[0] in the zeroing
 * step, so its read of sharedmem[0] after reduceBlock cannot be
 * clobbered by other threads starting the next chunk -- assuming
 * reduceBlock ends with a barrier; confirm against crm_cuda_utils.cu. */
__global__ void reduce_foriginal(int n, F_CFLOAT* foriginal)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  F_CFLOAT myforig = 0.0;
  F_CFLOAT* buf = (F_CFLOAT*) _buffer;
  buf = &buf[blockIdx.x * n];     // this block's stripe of n partials

  // sweep the partials in chunks of blockDim.x
  while(i < n) {
    sharedmem[threadIdx.x] = 0;

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);
    i += blockDim.x;

    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    foriginal[blockIdx.x] = myforig;
}

View File

@ -1,107 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_ave_force_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_aveforce_cuda_cu.h"
#include "fix_aveforce_cuda_kernel.cu"
// Ensure the shared device buffer can hold the per-block partial sums of
// this fix (4 F_CFLOAT values per block: fx, fy, fz, group count) and
// publish the buffer address to the device symbol.
// Cleanup: removed dead dim3 locals (threads/grid) -- only the layout
// product is needed to size the buffer.
void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
  int size = (unsigned)(layout.z * layout.y * layout.x) * 4 * sizeof(F_CFLOAT);

  if(sdata->buffersize < size) {
    // NOTE(review): buffersize is in bytes although the message says kB
    MYDBG(printf("Cuda_FixAveForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
    CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
}
// Refresh the atom counts and per-atom device array pointers used by this
// fix after a reallocation (nmax change).
void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax), &sdata->atom.nmax, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(mask), &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(x), &sdata->atom.x.dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f), &sdata->atom.f.dev_data, sizeof(F_CFLOAT*));
}
// One-time initialization: push the per-atom pointers/counts to the device.
void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixAveForceCuda_UpdateNmax(sdata);
}
// Reduce the original (fx, fy, fz) plus the group-member count over all
// group atoms into aforiginal[0..3] via a two-pass block reduction.
void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_CFLOAT* aforiginal)
{
  // refresh device symbols / buffer only when host-side state changed
  if(sdata->atom.update_nmax)
    Cuda_FixAveForceCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixAveForceCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal, 4 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  // pass 1: per-block partial sums into the device buffer
  Cuda_FixAveForceCuda_PostForce_FOrg_Kernel <<< grid, threads, threads.x* 4* sizeof(F_CFLOAT)>>> (groupbit);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed");

  // pass 2: one block per reduced quantity folds the partials together
  // NOTE(review): n passed to the reducer is grid.x only; this assumes
  // getgrid produced layout.y == 1 here -- confirm.
  const int oldgrid = grid.x;
  grid.x = 4;
  threads.x = 512;
  Cuda_FixAveForceCuda_reduce_foriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, aforiginal);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce Kernel execution failed");
}
// Overwrite the flagged force components of all group atoms with the
// given average values.
// Consistency fix: mirror the update guards of
// Cuda_FixAveForceCuda_PostForce_FOrg so the device symbols (nlocal,
// nmax, array pointers) are current even if _Set runs without a
// preceding _FOrg in the same step.  The symbol copies are idempotent,
// so this is harmless when _FOrg already refreshed them.
void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue)
{
  if(sdata->atom.update_nmax)
    Cuda_FixAveForceCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixAveForceCuda_PostForce_Set_Kernel <<< grid, threads, 0>>> (groupbit, xflag, yflag, zflag, axvalue, ayvalue, azvalue);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce_Set: fix ave_force post_force Compute Kernel execution failed");
}

View File

@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// One-time initialization of the fix aveforce device symbols.
extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata);
// Sums (fx, fy, fz, count) of group atoms into aforiginal[0..3].
extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit, F_CFLOAT* aforiginal);
// Overwrites the flagged force components of group atoms with the
// supplied average values.
extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit, int xflag, int yflag, int zflag, F_CFLOAT axvalue, F_CFLOAT ayvalue, F_CFLOAT azvalue);

View File

@ -1,96 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_CFLOAT sharedmem[];
/* Pass 1 of fix aveforce: one thread per atom.  For each group atom,
 * accumulate (fx, fy, fz, 1) -- the trailing 1 counts group members.
 * Each block reduces its shared arrays and writes four partials into
 * _buffer, in four stripes of gridDim.x*gridDim.y.  Non-group /
 * out-of-range threads contribute zeros; all threads must still execute
 * the reduceBlock calls.
 * NOTE(review): no explicit __syncthreads() before reduceBlock --
 * presumably reduceBlock synchronizes internally; confirm against
 * crm_cuda_utils.cu. */
__global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // zero all four shared-memory stripes so inactive threads are neutral
  sharedmem[threadIdx.x] = 0;
  sharedmem[threadIdx.x + blockDim.x] = 0;
  sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
  sharedmem[threadIdx.x + 3 * blockDim.x] = 0;

  if(i < _nlocal)
    if(_mask[i] & groupbit) {
      sharedmem[threadIdx.x] = _f[i];
      sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
      sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];
      sharedmem[threadIdx.x + 3 * blockDim.x] = 1;   // group-member count
    }

  // block-wide sums of the four quantities (all threads participate)
  reduceBlock(sharedmem);
  reduceBlock(&sharedmem[blockDim.x]);
  reduceBlock(&sharedmem[2 * blockDim.x]);
  reduceBlock(&sharedmem[3 * blockDim.x]);
  F_CFLOAT* buffer = (F_CFLOAT*) _buffer;

  if(threadIdx.x == 0) {
    // publish this block's partials, one stripe per quantity
    buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
    buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = sharedmem[3 * blockDim.x];
  }
}
/* Pass 2 of fix aveforce: fold the n per-block partials of each of the
 * four quantities into one value.  Launched with gridDim.x == 4 (one
 * block per quantity); blockIdx.x selects the stripe of _buffer written
 * by the compute kernel.  Only thread 0 accumulates the running total
 * and writes the result.
 * NOTE(review): only thread 0 ever writes sharedmem[0] in the zeroing
 * step, so its read of sharedmem[0] after reduceBlock cannot be
 * clobbered by other threads starting the next chunk -- assuming
 * reduceBlock ends with a barrier; confirm against crm_cuda_utils.cu. */
__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n, F_CFLOAT* foriginal)
{
  int i = 0;
  sharedmem[threadIdx.x] = 0;
  F_CFLOAT myforig = 0.0;
  F_CFLOAT* buf = (F_CFLOAT*) _buffer;
  buf = &buf[blockIdx.x * n];     // this block's stripe of n partials

  // sweep the partials in chunks of blockDim.x
  while(i < n) {
    sharedmem[threadIdx.x] = 0;

    if(i + threadIdx.x < n)
      sharedmem[threadIdx.x] = buf[i + threadIdx.x];

    __syncthreads();
    reduceBlock(sharedmem);
    i += blockDim.x;

    if(threadIdx.x == 0)
      myforig += sharedmem[0];
  }

  if(threadIdx.x == 0)
    foriginal[blockIdx.x] = myforig;
}
// Overwrite force components of group atoms with fixed values; a
// component is only touched when its flag is non-zero.
__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit, int xflag, int yflag, int zflag, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue)
{
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= _nlocal) return;

  if(!(_mask[i] & groupbit)) return;

  if(xflag) _f[i] = xvalue;

  if(yflag) _f[i + 1 * _nmax] = yvalue;

  if(zflag) _f[i + 2 * _nmax] = zvalue;
}

View File

@ -1,55 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_enforce2d_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_enforce2d_cuda_cu.h"
#include "fix_enforce2d_cuda_kernel.cu"
// Publish the device pointers and counters used by the enforce2d kernel to
// this translation unit's per-prefix device symbols (MY_AP namespace).
void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata)
{
  // scalar counters first, then array pointers (the copies are independent)
  cudaMemcpyToSymbol(MY_AP(nlocal), &sdata->atom.nlocal, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax), &sdata->atom.nmax, sizeof(int));
  cudaMemcpyToSymbol(MY_AP(mask), &sdata->atom.mask.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(v), &sdata->atom.v.dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f), &sdata->atom.f.dev_data, sizeof(F_CFLOAT*));
}
// Host driver for fix enforce2d post_force: launches a kernel that zeroes
// the z components of velocity and force for every atom in the group.
void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit)
{
  // re-copy device symbols if the atom arrays were reallocated
  if(sdata->atom.update_nmax)
    Cuda_FixEnforce2dCuda_Init(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  // 2D grid covering nlocal threads
  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixEnforce2dCuda_PostForce_Kernel <<< grid, threads>>> (groupbit);
  cudaThreadSynchronize();
  // fix: error label previously carried a duplicated "Cuda_" prefix
  CUT_CHECK_ERROR("Cuda_FixEnforce2dCuda_PostForce: fix enforce2d post_force Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit);

View File

@ -1,34 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Zero the z velocity and z force of every atom in the group, confining the
// dynamics to the xy plane.  Behavior identical to the original; restyled
// with guard clauses.
__global__ void Cuda_FixEnforce2dCuda_PostForce_Kernel(int groupbit)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;
  if(!(_mask[idx] & groupbit)) return;

  // z components live in the third plane of stride _nmax
  _v[idx + 2 * _nmax] = V_F(0.0);
  _f[idx + 2 * _nmax] = F_F(0.0);
}

View File

@ -1,98 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_freeze_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_freeze_cuda_cu.h"
#include "fix_freeze_cuda_kernel.cu"
// Ensure sdata->buffer is large enough for one 3-component force partial per
// reduction block, then publish the pointer to the device symbol _buffer.
void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata)
{
// layout is computed only to size the buffer; threads/grid are unused here
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
if(sdata->buffersize < size) {
// NOTE(review): the debug message says "kB" but buffersize is in bytes
MYDBG(printf("Cuda_FixFreezeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
sdata->buffer_new++;   // signal other modules that the shared buffer moved
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
}
// Refresh every device symbol that depends on the (re)allocated atom arrays
// used by the freeze kernels (mask, positions, forces, torques, counters).
void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(torque) , & sdata->atom.torque .dev_data, sizeof(F_CFLOAT*));
}
// One-time setup for fix freeze: just publishes the current array pointers.
void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixFreezeCuda_UpdateNmax(sdata);
}
// Host driver for fix freeze post_force: records the sum of the original
// forces of the frozen group into foriginal[0..2] via a two-stage GPU
// reduction, and zeroes force and torque on every frozen atom.
void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT* foriginal)
{
  if(sdata->atom.update_nmax)
    Cuda_FixFreezeCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixFreezeCuda_UpdateBuffer(sdata);

  int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  // stage 1: zero forces/torques and write per-block force partials to _buffer
  Cuda_FixFreezeCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_CFLOAT)>>> (groupbit);
  cudaThreadSynchronize();
  // fix: message previously said "fix add_force" (copy/paste from fix
  // addforce) and doubled the "Cuda_" prefix
  CUT_CHECK_ERROR("Cuda_FixFreezeCuda_PostForce: fix freeze post_force compute Kernel execution failed");

  // stage 2: one block per force component reduces the partials.
  // NOTE(review): only grid.x partials are reduced per component; presumably
  // getgrid returns layout.y == 1 here -- verify against getgrid.
  int oldgrid = grid.x;
  grid.x = 3;
  threads.x = 512;
  Cuda_FixFreezeCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, foriginal);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_FixFreezeCuda_PostForce: fix freeze post_force reduce Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT* foriginal);

View File

@ -1,87 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Dynamic shared memory for the block reductions below; this kernel is
// launched with 3 * blockDim.x elements.
extern __shared__ F_CFLOAT sharedmem[];
// fix freeze, stage 1: accumulate per-block partial sums of the forces on
// frozen atoms, then zero force and torque on those atoms.  Partials are
// written to _buffer as 3 consecutive arrays of gridDim.x*gridDim.y values.
__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
// zero the three reduction segments so non-group threads contribute nothing
sharedmem[threadIdx.x] = 0;
sharedmem[threadIdx.x + blockDim.x] = 0;
sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
if(i < _nlocal)
if(_mask[i] & groupbit) {
// stash the original force (x/y/z planes of stride _nmax) ...
sharedmem[threadIdx.x] = _f[i];
sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];
// ... then freeze the atom: clear its force and torque
_f[i] = F_F(0.0);
_f[i + 1 * _nmax] = F_F(0.0);
_f[i + 2 * _nmax] = F_F(0.0);
_torque[i] = F_F(0.0);
_torque[i + 1 * _nmax] = F_F(0.0);
_torque[i + 2 * _nmax] = F_F(0.0);
}
// reduce each component segment into its element 0 (reduceBlock comes from
// crm_cuda_utils.cu; presumably it synchronizes internally)
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2 * blockDim.x]);
F_CFLOAT* buffer = (F_CFLOAT*)_buffer;
if(threadIdx.x == 0) {
// thread 0 publishes the block's three component partials
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
}
}
// fix freeze, stage 2: final reduction of the per-block force partials.
// Launched with one block per force component; block b sums the n partials
// at _buffer[b*n .. b*n+n-1] and writes the total into foriginal[b].
__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n, F_CFLOAT* foriginal)
{
int i = 0;
sharedmem[threadIdx.x] = 0;
// running total; only thread 0's copy is ever meaningful
F_CFLOAT myforig = 0.0;
F_CFLOAT* buf = (F_CFLOAT*)_buffer;
// jump to the partials belonging to this block's component
buf = &buf[blockIdx.x * n];
// sweep the n partials in chunks of blockDim.x
while(i < n) {
sharedmem[threadIdx.x] = 0;
if(i + threadIdx.x < n)
sharedmem[threadIdx.x] = buf[i + threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i += blockDim.x;
if(threadIdx.x == 0)
myforig += sharedmem[0];   // accumulate this chunk's block sum
}
if(threadIdx.x == 0)
foriginal[blockIdx.x] = myforig;
}

View File

@ -1,92 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_gravity_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_gravity_cuda_cu.h"
#include "fix_gravity_cuda_kernel.cu"
// Ensure sdata->buffer can hold 3 F_CFLOATs per thread-slot and publish the
// pointer to the device symbol _buffer.  NOTE(review): the gravity kernel
// performs no reduction; the sizing mirrors the other fixes in this package.
void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata)
{
// layout is computed only to size the buffer; threads/grid are unused here
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
if(sdata->buffersize < size) {
// NOTE(review): the debug message says "kB" but buffersize is in bytes
MYDBG(printf("Cuda_FixGravityCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
sdata->buffer_new++;   // signal other modules that the shared buffer moved
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
}
// Refresh every device symbol the gravity kernel reads: mask, positions,
// forces, per-type masses and (if present) per-atom masses.
void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
// rmass_flag selects per-atom (rmass) vs per-type (mass) masses in the kernel
cudaMemcpyToSymbol(MY_AP(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass .dev_data, sizeof(V_CFLOAT*));
}
// One-time setup for fix gravity: just publishes the current array pointers.
void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixGravityCuda_UpdateNmax(sdata);
}
// Host driver for fix gravity post_force: adds mass * (xacc, yacc, zacc) to
// the force of every atom in the group.
void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc)
{
  if(sdata->atom.update_nmax)
    Cuda_FixGravityCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));

  if(sdata->buffer_new)
    Cuda_FixGravityCuda_UpdateBuffer(sdata);

  // 2D grid covering nlocal threads
  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixGravityCuda_PostForce_Kernel <<< grid, threads>>> (groupbit, xacc, yacc, zacc);
  cudaThreadSynchronize();
  // fix: message previously said "fix add_force" (copy/paste from fix
  // addforce) and doubled the "Cuda_" prefix
  CUT_CHECK_ERROR("Cuda_FixGravityCuda_PostForce: fix gravity post_force Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc);

View File

@ -1,36 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Add the gravitational force m * (xacc, yacc, zacc) to every atom in the
// group.  Behavior identical to the original; restyled with guard clauses.
__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit, F_CFLOAT xacc, F_CFLOAT yacc, F_CFLOAT zacc)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;
  if(!(_mask[idx] & groupbit)) return;

  // per-atom mass if rmass is present, otherwise the per-type mass
  const F_CFLOAT m = _rmass_flag ? _rmass[idx] : _mass[_type[idx]];

  _f[idx] += m * xacc;
  _f[idx + _nmax] += m * yacc;
  _f[idx + 2 * _nmax] += m * zacc;
}

View File

@ -1,255 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_nh_cuda
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_nh_cuda_cu.h"
#include "fix_nh_cuda_kernel.cu"
// Refresh every device symbol the NH kernels read after the atom arrays have
// been (re)allocated: forces, velocities, positions, masses, masks, and the
// neighbor-trigger bookkeeping (xhold / maxhold / reneigh_flag).
void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(debugdata) , & sdata->debugdata, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_CFLOAT*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(maxhold) , & sdata->atom.maxhold, sizeof(int)); //might be moved to a neighbor record in sdata
// note: the reneighbor flag is aliased onto sdata->buffer
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_CFLOAT)); //might be moved to a neighbor record in sdata
}
// Ensure the shared buffer can hold at least 10 ints, then publish it both
// as _buffer and as _reneigh_flag (the flag is stored at buffer[0]).
void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size = (unsigned)10 * sizeof(int);
if(sdata->buffersize < size) {
// NOTE(review): the debug message says "kB" but buffersize is in bytes
MYDBG(printf("Cuda_FixNHCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
sdata->buffer_new++;   // signal other modules that the shared buffer moved
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
}
// One-time setup for fix nh: publish the integration timesteps (dtv for
// positions, dtf for velocities), the reneighbor-trigger parameters and the
// mass-selection flag, then refresh the array pointers.
void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf)
{
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
Cuda_FixNHCuda_UpdateNmax(sdata);
}
// Host driver for the barostat velocity scaling (nh_v_press): scales the
// velocities of the first mynlocal atoms in the group by factor_h[0..2],
// with the extra off-diagonal couplings factor_h[3..5] for triclinic cells.
void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
// time the symbol updates into cuda_timings.test1
my_times atime1, atime2;
my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
// diagonal scale factors; factor2 holds the triclinic couplings and is
// left uninitialized otherwise (the kernel reads it only if p_triclinic)
F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
F_CFLOAT3 factor2;
if(p_triclinic) {
factor2.x = factor_h[3], factor2.y = factor_h[4];
factor2.z = factor_h[5];
}
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nh_v_press_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}
// Host driver for the fused barostat-scaling + velocity half-step kernel
// (no temperature bias): v = scale(v) + dtf/m * f for the group.
void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
// diagonal scale factors; factor2 holds the triclinic couplings and is
// left uninitialized otherwise (the kernel reads it only if p_triclinic)
F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
F_CFLOAT3 factor2;
if(p_triclinic) {
factor2.x = factor_h[3], factor2.y = factor_h[4];
factor2.z = factor_h[5];
}
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
CUT_CHECK_ERROR("FixNHCuda: fix nh v_press pre Kernel execution failed");
FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}
// Host driver for the thermostat velocity scaling (nh_v_temp): multiplies
// all velocity components of group atoms by factor_eta.
void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_CFLOAT factor_eta, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
// time the symbol updates into cuda_timings.test1
my_times atime1, atime2;
my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nh_v_temp_Kernel <<< grid, threads>>> (groupbit, factor_eta);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: fix nh v_temp Kernel execution failed");
}
// Host driver for the velocity half-step (nve_v): v += dtf/m * f for the
// first mynlocal atoms in the group.
void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
// time the symbol updates into cuda_timings.test1
my_times atime1, atime2;
my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nve_v_Kernel <<< grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: nve_v Kernel execution failed");
}
// Host driver for the position full step (nve_x): x += dtv * v for group
// atoms, and read back whether any atom moved far enough to require
// reneighboring (flag accumulated by the kernel in buffer[0]).
void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
// time the symbol updates into cuda_timings.test1
my_times atime1, atime2;
my_gettime(CLOCK_REALTIME, &atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
my_gettime(CLOCK_REALTIME, &atime2);
sdata->cuda_timings.test1 +=
atime2.tv_sec - atime1.tv_sec + 1.0 * (atime2.tv_nsec - atime1.tv_nsec) / 1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
// clear the device-side reneighbor flag (first int of the shared buffer)
cudaMemset(sdata->buffer, 0, sizeof(int));
FixNHCuda_nve_x_Kernel <<< grid, threads>>> (groupbit);
cudaThreadSynchronize();
// copy the flag back and accumulate it into the host-side counter
int reneigh_flag;
cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
sdata->atom.reneigh_flag += reneigh_flag;
CUT_CHECK_ERROR("FixNHCuda: nve_x Kernel execution failed");
}
// Host driver for the fused velocity half-step + barostat-scaling kernel
// (no temperature bias): v = scale(v + dtf/m * f) for the group.
void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic) //mynlocal can be nfirst if firstgroup==igroup see cpp
{
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
// diagonal scale factors; factor2 holds the triclinic couplings and is
// left uninitialized otherwise (the kernel reads it only if p_triclinic)
F_CFLOAT3 factor = {factor_h[0], factor_h[1], factor_h[2]};
F_CFLOAT3 factor2;
if(p_triclinic) {
factor2.x = factor_h[3], factor2.y = factor_h[4];
factor2.z = factor_h[5];
}
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel <<< grid, threads>>> (groupbit, factor, p_triclinic, factor2);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda__nve_v_and_nh_v_press_NoBias: Kernel execution failed");
}

View File

@ -1,32 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf);
extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_CFLOAT factor_eta, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit, int mynlocal); //mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h, int mynlocal, int p_triclinic); //mynlocal can be nfirst if firstgroup==igroup see cpp

View File

@ -1,205 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Warp-collective check whether any group atom moved farther (squared) than
// _triggerneighsq since the positions were recorded in _xhold; if so, set
// _reneigh_flag[0] to request reneighboring.  Uses the warp vote __all, so
// it must be executed by every thread of the warp -- this is why the caller
// invokes it outside its own i < _nlocal / mask guard.
static inline __device__ void check_distance(X_CFLOAT &xtmp, X_CFLOAT &ytmp, X_CFLOAT &ztmp, int &i, int groupbit)
{
if(_dist_check) {
X_CFLOAT d = X_F(0.0);
if(i < _nlocal) {
// squared displacement from the position held at the last reneighboring
// (xhold components are planes of stride _maxhold)
X_CFLOAT tmp = xtmp - _xhold[i];
d = tmp * tmp;
tmp = ytmp - _xhold[i + _maxhold];
d += tmp * tmp;
tmp = ztmp - _xhold[i + 2 * _maxhold];
d += tmp * tmp;
// atoms outside the group never trigger reneighboring
d = ((_mask[i] & groupbit)) ? d : X_F(0.0);
}
// if any thread in the warp exceeded the trigger, raise the global flag
if(not __all(d <= _triggerneighsq))
_reneigh_flag[0] = 1;
}
}
// Barostat velocity scaling: v *= factor before and after the triclinic
// cross-coupling -- the factor is deliberately applied twice (symmetric
// half-step splitting; presumably mirrors FixNH::nh_v_press on the CPU).
__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
// velocity components are planes of stride _nmax behind my_v
V_CFLOAT* my_v = _v + i;
V_CFLOAT vx = my_v[0];
V_CFLOAT vy = my_v[_nmax];
V_CFLOAT vz = my_v[2 * _nmax];
// first half of the symmetric scaling
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
if(p_triclinic) {
// off-diagonal couplings for triclinic cells
vx += vy * factor2.z + vz * factor2.y;
vy += vz * factor2.x;
}
// second half of the symmetric scaling
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
my_v[0] = vx;
my_v[_nmax] = vy;
my_v[2 * _nmax] = vz;
}
}
// Thermostat velocity scaling: multiply every velocity component of the
// group atoms by factor_eta.  Behavior identical to the original; restyled
// with guard clauses.
__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_CFLOAT factor_eta)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;
  if(!(_mask[idx] & groupbit)) return;

  // velocity components are planes of stride _nmax behind vel
  V_CFLOAT* vel = _v + idx;
  vel[0] *= factor_eta;
  vel[_nmax] *= factor_eta;
  vel[2 * _nmax] *= factor_eta;
}
// Fused kernel: barostat scaling of v (symmetric, applied twice around the
// triclinic coupling) followed by the velocity half-step v += dtf/m * f.
__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
// force/velocity components are planes of stride _nmax behind these pointers
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
// dtf scaled by 1/m (per-atom mass if rmass is present, else per-type)
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];
V_CFLOAT vx = my_v[0];
V_CFLOAT vy = my_v[_nmax];
V_CFLOAT vz = my_v[2 * _nmax];
// first half of the symmetric barostat scaling
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
if(p_triclinic) {
// off-diagonal couplings for triclinic cells
vx += vy * factor2.z + vz * factor2.y;
vy += vz * factor2.x;
}
// second half of the symmetric scaling
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
// velocity half-step using the current force
my_v[0] = vx + dtfm * my_f[0];
my_v[_nmax] = vy + dtfm * my_f[_nmax];
my_v[2 * _nmax] = vz + dtfm * my_f[_nmax * 2];
}
}
// Velocity half-step: v += dtf/m * f for every atom in the group.  Behavior
// identical to the original; restyled with guard clauses and indexed access
// instead of pointer bumping.
__global__ void FixNHCuda_nve_v_Kernel(int groupbit)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;
  if(!(_mask[idx] & groupbit)) return;

  // force/velocity components are planes of stride _nmax behind these pointers
  F_CFLOAT* fp = _f + idx;
  V_CFLOAT* vp = _v + idx;

  // dtf scaled by 1/m (per-atom mass if rmass is present, else per-type)
  V_CFLOAT dtfm = _dtf;
  if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[idx];
  else dtfm *= V_F(1.0) / _mass[_type[idx]];

  vp[0] = vp[0] + dtfm * fp[0];
  vp[_nmax] = vp[_nmax] + dtfm * fp[_nmax];
  vp[2 * _nmax] = vp[2 * _nmax] + dtfm * fp[2 * _nmax];
}
// Position full step: x += dtv * v for every atom in the group, followed by
// a warp-collective displacement check that may request reneighboring.
__global__ void FixNHCuda_nve_x_Kernel(int groupbit)
{
  // Fix: initialize the temporaries.  They were previously left
  // indeterminate for threads outside the group / beyond nlocal, and
  // check_distance reads them unconditionally when i < _nlocal (the result
  // is masked to zero, but reading uninitialized locals is undefined
  // behavior).  Zeroing them changes no observable result.
  X_CFLOAT xtmp = X_F(0.0), ytmp = X_F(0.0), ztmp = X_F(0.0);
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i < _nlocal && _mask[i] & groupbit) {
    // position/velocity components are planes of stride _nmax
    V_CFLOAT* my_v = _v + i;
    X_CFLOAT* my_x = _x + i;
    xtmp = *my_x += _dtv * *my_v;   // keep the new coordinates for the check
    my_v += _nmax;
    my_x += _nmax;
    ytmp = *my_x += _dtv * *my_v;
    my_v += _nmax;
    my_x += _nmax;
    ztmp = *my_x += _dtv * *my_v;
  }

  // must be executed by every thread of the warp (warp vote inside)
  check_distance(xtmp, ytmp, ztmp, i, groupbit);
}
// Fused kernel: velocity half-step v += dtf/m * f, then the symmetric
// barostat scaling (factor applied before and after the triclinic coupling).
__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_CFLOAT3 factor, int p_triclinic, F_CFLOAT3 factor2)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
// force/velocity components are planes of stride _nmax behind these pointers
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
// dtf scaled by 1/m (per-atom mass if rmass is present, else per-type)
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];
// velocity half-step first ...
V_CFLOAT vx = my_v[0] + dtfm * my_f[0];
V_CFLOAT vy = my_v[_nmax] + dtfm * my_f[_nmax];
V_CFLOAT vz = my_v[2 * _nmax] + dtfm * my_f[2 * _nmax];
// ... then the first half of the symmetric barostat scaling
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
if(p_triclinic) {
// off-diagonal couplings for triclinic cells
vx += vy * factor2.z + vz * factor2.y;
vy += vz * factor2.x;
}
// second half of the symmetric scaling
vx *= factor.x;
vy *= factor.y;
vz *= factor.z;
my_v[0] = vx;
my_v[_nmax] = vy;
my_v[2 * _nmax] = vz;
}
}

View File

@ -1,134 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_nve_cuda
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_nve_cuda_cu.h"
#include "fix_nve_cuda_kernel.cu"
// Refresh this compilation unit's __constant__ copies of the atom-array
// device pointers and sizes. Must be called whenever the per-atom arrays
// are reallocated on the device (i.e. when nmax changes), because the old
// pointers then dangle.
void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_CFLOAT*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(maxhold) , & sdata->atom.maxhold, sizeof(int)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_AP(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_CFLOAT)); //might be moved to a neighbor record in sdata
}
// Ensure the shared scratch buffer is large enough for this fix (10 ints)
// and publish its address to the device __constant__ symbols.
// NOTE(review): the MYDBG messages label the size "kB" but buffersize is in
// bytes — debug output only, but misleading.
void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size = (unsigned)10 * sizeof(int);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_FixNVECuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
// signal other users of the shared buffer that it was reallocated
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(reneigh_flag), & sdata->buffer, sizeof(int*)); //might be moved to a neighbor record in sdata
}
// One-time setup for fix nve/cuda: copy the timestep constants (dtv = dt,
// dtf = 0.5*dt*force->ftm2v on the host side) and run-invariant flags to the
// device, then refresh the per-array pointers via UpdateNmax.
void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf)
{
cudaMemcpyToSymbol(MY_AP(mass) , & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(dtf) , & dtf , sizeof(V_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dist_check), & sdata->atom.dist_check , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
Cuda_FixNVECuda_UpdateNmax(sdata);
}
// Launch the initial-integration kernel (velocity + position half-step) and
// read back the reneighbor trigger flag accumulated by check_distance.
void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixNVECuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNVECuda_UpdateBuffer(sdata);
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
// first int of the buffer doubles as the reneighbor flag; clear it first
cudaMemset(sdata->buffer, 0, sizeof(int));
FixNVECuda_InitialIntegrate_Kernel <<< grid, threads>>> (groupbit);
cudaThreadSynchronize();
int reneigh_flag;
cudaMemcpy((void*)(&reneigh_flag), sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
sdata->atom.reneigh_flag += reneigh_flag;
CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate Kernel execution failed");
}
// Launch the final-integration kernel (second velocity half-step).
// The CUDA_USE_BINNING path launches one block per bin; the default path
// uses a flat per-atom decomposition from getgrid().
void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixNVECuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNVECuda_UpdateBuffer(sdata);
#ifdef CUDA_USE_BINNING
dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1);
dim3 threads(sdata->domain.bin_nmax, 1, 1);
FixNVECuda_FinalIntegrate_Kernel <<< grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate (binning) Kernel execution failed");
#else
int3 layout = getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNVECuda_FinalIntegrate_Kernel <<< grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate Kernel execution failed");
#endif
}

View File

@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
// Host-side C interface to the fix nve/cuda GPU implementation
// (defined in fix_nve_cuda.cu; called from the C++ fix class).
extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, V_CFLOAT dtf);
extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);

View File

@ -1,166 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Device helper: set the reneighbor flag if any atom in the warp has moved
// more than the neighbor-skin trigger distance since the last neighbor build.
// Must be called by ALL threads of the block (uses a warp vote); callers pass
// possibly-uninitialized coordinates for out-of-group threads, which is safe
// because d is forced to 0 for them below.
static inline __device__ void check_distance(X_CFLOAT &xtmp, X_CFLOAT &ytmp, X_CFLOAT &ztmp, int &i, int groupbit)
{
if(_dist_check) {
// squared displacement from the held position (stride _maxhold per component)
X_CFLOAT tmp = xtmp - _xhold[i];
X_CFLOAT d = tmp * tmp;
tmp = ytmp - _xhold[i + _maxhold];
d += tmp * tmp;
tmp = ztmp - _xhold[i + 2 * _maxhold];
d += tmp * tmp;
// neutralize threads outside nlocal/group so they cannot trigger a rebuild
d = ((i < _nlocal) && (_mask[i] & groupbit)) ? d : X_F(0.0);
// warp vote: if any lane exceeds the trigger distance, request reneighboring
if(not __all(d <= _triggerneighsq))
_reneigh_flag[0] = 1;
}
}
// NVE initial integration: v += dtf/m * f (half-step), then x += dtv * v,
// followed by a block-wide displacement check that may request reneighboring.
// Arrays are component-major with stride _nmax (binning path: blockDim.x).
// Fix: the CUDA_USE_BINNING branch was missing the semicolon after
// "V_CFLOAT dtfm = _dtf", which would not compile when binning is enabled.
__global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
{
// xtmp/ytmp/ztmp stay uninitialized for non-group threads; check_distance
// zeroes their contribution, so this is safe (see check_distance).
X_CFLOAT xtmp, ytmp, ztmp;
#ifdef CUDA_USE_BINNING
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
if(threadIdx.x < _bin_count_local[bin]) {
const int i = 3 * blockDim.x * bin + threadIdx.x;
if(_mask[i] & groupbit) {
F_CFLOAT* my_f = _binned_f + i;
V_CFLOAT* my_v = _binned_v + i;
X_CFLOAT* my_x = _binned_x + i;
V_CFLOAT dtfm = _dtf;  // fixed: semicolon was missing
if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
V_CFLOAT v_mem;
v_mem = *my_v += dtfm * (*my_f);
xtmp = *my_x += _dtv * v_mem;
my_f += blockDim.x;
my_v += blockDim.x;
my_x += blockDim.x;
v_mem = *my_v += dtfm * (*my_f);
ytmp = *my_x += _dtv * v_mem;
my_f += blockDim.x;
my_v += blockDim.x;
my_x += blockDim.x;
v_mem = *my_v += dtfm * (*my_f);
ztmp = *my_x += _dtv * v_mem;
}
}
#else
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
X_CFLOAT* my_x = _x + i;
// dtfm = dt * 0.5 / m (per-atom or per-type mass)
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];
V_CFLOAT v_mem;
// x component: velocity half-step, then position update
v_mem = *my_v += dtfm * (*my_f);
xtmp = *my_x += _dtv * v_mem;
my_f += _nmax;
my_v += _nmax;
my_x += _nmax;
// y component
v_mem = *my_v += dtfm * (*my_f);
ytmp = *my_x += _dtv * v_mem;
my_f += _nmax;
my_v += _nmax;
my_x += _nmax;
// z component
v_mem = *my_v += dtfm * (*my_f);
ztmp = *my_x += _dtv * v_mem;
}
#endif
// must be executed by all threads (warp vote inside)
check_distance(xtmp, ytmp, ztmp, i, groupbit);
}
// NVE final integration: second velocity half-step v += dtf/m * f.
// Fix: the CUDA_USE_BINNING branch was missing the semicolon after
// "V_CFLOAT dtfm = _dtf", which would not compile when binning is enabled.
__global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
{
#ifdef CUDA_USE_BINNING
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
if(threadIdx.x < _bin_count_local[bin]) {
const int i = 3 * blockDim.x * bin + threadIdx.x;
if(_mask[i] & groupbit) {
F_CFLOAT* my_f = _binned_f + i;
V_CFLOAT* my_v = _binned_v + i;
V_CFLOAT dtfm = _dtf;  // fixed: semicolon was missing
if(_rmass_flag) dtfm *= V_F(1.0) / _binned_rmass[i];
else dtfm *= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
// binned arrays use stride blockDim.x per component
*my_v += dtfm * (*my_f);
my_f += blockDim.x;
my_v += blockDim.x;
*my_v += dtfm * (*my_f);
my_f += blockDim.x;
my_v += blockDim.x;
*my_v += dtfm * (*my_f);
}
}
#else
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit) {
F_CFLOAT* my_f = _f + i;
V_CFLOAT* my_v = _v + i;
// dtfm = dt * 0.5 / m (per-atom or per-type mass)
V_CFLOAT dtfm = _dtf;
if(_rmass_flag) dtfm *= V_F(1.0) / _rmass[i];
else dtfm *= V_F(1.0) / _mass[_type[i]];
// x, y, z components, stride _nmax each
*my_v += dtfm * (*my_f);
my_f += _nmax;
my_v += _nmax;
*my_v += dtfm * (*my_f);
my_f += _nmax;
my_v += _nmax;
*my_v += dtfm * (*my_f);
}
#endif
}

View File

@ -1,96 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_set_force_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_set_force_cuda_cu.h"
#include "fix_set_force_cuda_kernel.cu"
// Grow the shared scratch buffer so it can hold the per-block partial sums
// (3 F_CFLOATs per launched thread) produced by the post-force reduction,
// and publish the (possibly new) buffer pointer to the device.
void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
// mirror the launch geometry used in PostForce to size the buffer
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size = (unsigned)(layout.z * layout.y * layout.x) * 3 * sizeof(F_CFLOAT);
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_FixSetForceCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
// signal other users of the shared buffer that it was reallocated
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
}
// Refresh this unit's __constant__ copies of atom-array device pointers and
// counters; call whenever device atom storage is reallocated (nmax changes).
void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
}
// One-time setup for fix setforce/cuda: just seed the device pointers.
void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixSetForceCuda_UpdateNmax(sdata);
}
// post_force for fix setforce/cuda: overwrite force components selected by
// flagx/flagy/flagz with the given values, while accumulating the ORIGINAL
// total force into foriginal[0..2] via a two-stage reduction
// (per-block partial sums in the scratch buffer, then a 3-block reduce).
// Fix: the two CUT_CHECK_ERROR labels were copy-pasted from the addforce fix
// ("Cuda_Cuda_FixSetForceCuda...: fix add_force ...") and misreported the
// failing routine; they now name this fix correctly.
void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, F_CFLOAT* foriginal, int flagx, int flagy, int flagz)
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixSetForceCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixSetForceCuda_UpdateBuffer(sdata);
int3 layout = getgrid(sdata->atom.nlocal, 3 * sizeof(F_CFLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
// stage 1: set forces and write per-block partial sums of the old forces
Cuda_FixSetForceCuda_PostForce_Kernel <<< grid, threads, threads.x* 3* sizeof(F_CFLOAT)>>> (groupbit, xvalue, yvalue, zvalue, flagx, flagy, flagz);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixSetForceCuda_PostForce: fix setforce post_force compute Kernel execution failed");
// stage 2: one block per force component reduces the partial sums
int oldgrid = grid.x;
grid.x = 3;
threads.x = 512;
Cuda_FixSetForceCuda_Reduce_FOriginal <<< grid, threads, threads.x* sizeof(F_CFLOAT)>>> (oldgrid, foriginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixSetForceCuda_PostForce: fix setforce post_force reduce Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
// Host-side C interface to the fix setforce/cuda GPU implementation
// (defined in fix_set_force_cuda.cu; called from the C++ fix class).
extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, F_CFLOAT* foriginal, int flagx, int flagy, int flagz);

View File

@ -1,86 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_CFLOAT sharedmem[];
// Stage-1 kernel: for each group atom, stash the current force components in
// shared memory (for the foriginal sum) and overwrite the components selected
// by flagx/flagy/flagz. Each block then reduces its three shared-memory
// segments and writes one partial sum per component into the global buffer,
// laid out as [all x-sums][all y-sums][all z-sums].
__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit, F_CFLOAT xvalue, F_CFLOAT yvalue, F_CFLOAT zvalue, int flagx, int flagy, int flagz)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
// zero all three segments so out-of-range/non-group threads contribute 0
sharedmem[threadIdx.x] = 0;
sharedmem[threadIdx.x + blockDim.x] = 0;
sharedmem[threadIdx.x + 2 * blockDim.x] = 0;
if(i < _nlocal)
if(_mask[i] & groupbit) {
// record the pre-modification force (component-major, stride _nmax)
sharedmem[threadIdx.x] = _f[i];
sharedmem[threadIdx.x + blockDim.x] = _f[i + 1 * _nmax];
sharedmem[threadIdx.x + 2 * blockDim.x] = _f[i + 2 * _nmax];
// then clamp the selected components to the fixed values
if(flagx) _f[i] = xvalue;
if(flagy) _f[i + 1 * _nmax] = yvalue;
if(flagz) _f[i + 2 * _nmax] = zvalue;
}
// block-wide reduction of each component segment (result lands in slot 0)
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2 * blockDim.x]);
F_CFLOAT* buffer = (F_CFLOAT*)_buffer;
if(threadIdx.x == 0) {
buffer[blockIdx.x * gridDim.y + blockIdx.y] = sharedmem[0];
buffer[blockIdx.x * gridDim.y + blockIdx.y + gridDim.x * gridDim.y] = sharedmem[blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = sharedmem[2 * blockDim.x];
}
}
// Stage-2 kernel: launched with one block per force component (gridDim.x==3).
// Each block strides over its n partial sums in the buffer in chunks of
// blockDim.x, reduces each chunk in shared memory, and accumulates the chunk
// totals into foriginal[blockIdx.x] via thread 0.
__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n, F_CFLOAT* foriginal)
{
int i = 0;
sharedmem[threadIdx.x] = 0;
F_CFLOAT myforig = 0.0;
// each block reads its own component's segment of partial sums
F_CFLOAT* buf = (F_CFLOAT*)_buffer;
buf = &buf[blockIdx.x * n];
while(i < n) {
// reload shared memory for this chunk (zero-padded past n)
sharedmem[threadIdx.x] = 0;
if(i + threadIdx.x < n)
sharedmem[threadIdx.x] = buf[i + threadIdx.x];
__syncthreads();
// NOTE(review): correctness of reusing sharedmem next iteration relies on
// reduceBlock synchronizing before returning — confirm in crm_cuda_utils.cu
reduceBlock(sharedmem);
i += blockDim.x;
if(threadIdx.x == 0)
myforig += sharedmem[0];
}
if(threadIdx.x == 0)
foriginal[blockIdx.x] = myforig;
}

View File

@ -1,297 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_shake_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_shake_cuda_cu.h"
#include "cuda_pair_virial_kernel_nc.cu"
#define _shake_atom MY_AP(shake_atom)
#define _shake_type MY_AP(shake_type)
#define _shake_flag MY_AP(shake_flag)
#define _xshake MY_AP(xshake)
#define _dtfsq MY_AP(dtfsq)
#define _bond_distance MY_AP(bond_distance)
#define _angle_distance MY_AP(angle_distance)
#define _max_iter MY_AP(max_iter)
#define _tolerance MY_AP(tolerance)
__device__ __constant__ int* _shake_atom;
__device__ __constant__ int* _shake_type;
__device__ __constant__ int* _shake_flag;
__device__ __constant__ X_CFLOAT3* _xshake;
__device__ __constant__ F_CFLOAT _dtfsq;
__device__ __constant__ X_CFLOAT* _bond_distance;
__device__ __constant__ X_CFLOAT* _angle_distance;
__device__ __constant__ int _max_iter;
__device__ __constant__ X_CFLOAT _tolerance;
#include "fix_shake_cuda_kernel.cu"
// Refresh this unit's __constant__ copies of the atom-array device pointers
// and counters used by fix shake/cuda; call whenever device atom storage is
// reallocated (nmax changes).
void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(x) , & sdata->atom.x .dev_data, sizeof(X_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(f) , & sdata->atom.f .dev_data, sizeof(F_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(tag) , & sdata->atom.tag .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(type) , & sdata->atom.type .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(map_array), & sdata->atom.map_array .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(debugdata), & sdata->debugdata , sizeof(int*));
}
// Copy the current simulation-box description (periodicity, box lengths,
// triclinic flag and h-matrix) to the device; call when the box changes.
void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(periodicity), sdata->domain.periodicity , sizeof(int) * 3);
cudaMemcpyToSymbol(MY_AP(prd) , sdata->domain.prd , sizeof(X_CFLOAT) * 3);
cudaMemcpyToSymbol(MY_AP(triclinic) , &sdata->domain.triclinic , sizeof(int));
cudaMemcpyToSymbol(MY_AP(h) , sdata->domain.h , sizeof(X_CFLOAT) * 6);
}
// Grow the shared scratch buffer to at least 'size' bytes (callers size it
// for comm staging or virial partial sums) and publish its device address.
// NOTE(review): the MYDBG messages say "kB" but buffersize is in bytes.
void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata, int size)
{
if(sdata->buffersize < size) {
MYDBG(printf("Cuda_FixShakeCuda Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
// signal other users of the shared buffer that it was reallocated
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_AP(buffer) , & sdata->buffer, sizeof(int*));
}
// One-time setup for fix shake/cuda: publish the fix's device arrays
// (shake topology, xshake scratch, constraint distances, virial), the
// timestep constants and the SHAKE iteration parameters to __constant__
// symbols, then seed the generic atom/domain pointers.
// Fix: MY_AP(flag) was copied twice (identical calls); the duplicate at the
// end of the function has been removed — behavior is unchanged.
void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, F_CFLOAT dtfsq,
void* shake_flag, void* shake_atom, void* shake_type, void* xshake,
void* bond_distance, void* angle_distance, void* virial,
int max_iter, X_CFLOAT tolerance)
{
Cuda_FixShakeCuda_UpdateNmax(sdata);
Cuda_FixShakeCuda_UpdateDomain(sdata);
cudaMemcpyToSymbol(MY_AP(shake_atom) , & shake_atom , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(shake_type) , & shake_type , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(shake_flag) , & shake_flag , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(xshake) , & xshake , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(dtv) , & dtv , sizeof(X_CFLOAT));
cudaMemcpyToSymbol(MY_AP(dtfsq) , & dtfsq , sizeof(F_CFLOAT));
cudaMemcpyToSymbol(MY_AP(bond_distance) , & bond_distance , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(angle_distance) , & angle_distance , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(virial) , & virial , sizeof(void*));
cudaMemcpyToSymbol(MY_AP(flag) , &sdata->flag , sizeof(int*));
cudaMemcpyToSymbol(MY_AP(max_iter) , &max_iter , sizeof(int));
cudaMemcpyToSymbol(MY_AP(tolerance) , &tolerance , sizeof(X_CFLOAT));
// per-type masses only exist when rmass is not used
if(sdata->atom.mass_host)
cudaMemcpyToSymbol(MY_AP(mass), & sdata->atom.mass.dev_data , sizeof(V_CFLOAT*));
cudaMemcpyToSymbol(MY_AP(rmass_flag), & sdata->atom.rmass_flag , sizeof(int)); //
}
// Compute the unconstrained (pre-SHAKE) predicted positions into xshake
// for all local atoms.
void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata)
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixShakeCuda_UpdateBuffer(sdata, 10 * sizeof(double));
int3 layout = getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixShakeCuda_UnconstrainedUpdate_Kernel <<< grid, threads>>> ();
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixShakeCuda_UnconstrainedUpdate: Kernel execution failed");
}
// Apply the SHAKE constraints for the 'nlist' clusters listed in 'list'
// (device pointer). When vflag is set, per-block virial partial sums are
// staged in the shared buffer and reduced by a second kernel launch.
void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist)
{
// re-sync device constants if atom storage, box or counts changed
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->domain.update)
Cuda_FixShakeCuda_UpdateDomain(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
// 64-thread blocks; 6 ENERGY_CFLOATs of shared memory per thread (virial)
int3 layout = getgrid(sdata->atom.nlocal, 6 * sizeof(ENERGY_CFLOAT), 64);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->buffer_new)
Cuda_FixShakeCuda_UpdateBuffer(sdata, grid.x * grid.y * 6 * sizeof(ENERGY_CFLOAT));
BindXTypeTexture(sdata);
FixShakeCuda_Shake_Kernel <<< grid, threads, 6* threads.x* sizeof(ENERGY_CFLOAT)>>> (vflag, vflag_atom, list, nlist);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel execution failed");
if(vflag) {
// reduce the n per-block partial sums into the 6 virial components
int n = grid.x * grid.y;
grid.x = 6;
grid.y = 1;
threads.x = 256;
MY_AP(PairVirialCompute_reduce) <<< grid, threads, threads.x* sizeof(ENERGY_CFLOAT)>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed");
}
}
// Pack xshake coordinates of the n atoms in swap 'iswap' of the send list
// into buf_send (host memory), applying the periodic-image shift given by
// pbc/pbc_flag. Returns the number of packed values (3 per atom).
int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag)
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
// periodic-image shift to add to the packed coordinates (zero if no wrap)
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
dx = pbc[0] * sdata->domain.prd[0];
dy = pbc[1] * sdata->domain.prd[1];
dz = pbc[2] * sdata->domain.prd[2];
} else {
// triclinic box: include tilt-factor contributions
dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
dz = pbc[2] * sdata->domain.prd[2];
}
}
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
cudaMemset(sdata->flag, 0, sizeof(int));
FixShakeCuda_PackComm_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz);
cudaThreadSynchronize();
// stage packed data through the shared buffer to the host send buffer
cudaMemcpy(buf_send, sdata->buffer, n * 3 * sizeof(X_CFLOAT), cudaMemcpyDeviceToHost);
// aflag is a kernel-side error indicator; nonzero means a bad list entry
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
if(aflag != 0) printf("aflag PackComm: %i\n", aflag);
CUT_CHECK_ERROR("Cuda_FixShakeCuda_PackComm: Kernel execution failed");
}
return 3 * n;
}
// Self-communication variant of PackComm: the receiving atoms live on this
// same process, so xshake data is copied device-to-device directly into the
// slots starting at atom index 'first' instead of being staged to the host.
// Returns the number of packed values (3 per atom).
// Fixes: removed a dead debug counter ("static int count") left over from
// development, and corrected the error label, which was copy-pasted from
// comm_cuda ("Cuda_CommCuda_PackComm_Self").
int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag)
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
// periodic-image shift to add to the packed coordinates (zero if no wrap)
X_CFLOAT dx = 0.0;
X_CFLOAT dy = 0.0;
X_CFLOAT dz = 0.0;
if(pbc_flag != 0) {
if(sdata->domain.triclinic == 0) {
dx = pbc[0] * sdata->domain.prd[0];
dy = pbc[1] * sdata->domain.prd[1];
dz = pbc[2] * sdata->domain.prd[2];
} else {
// triclinic box: include tilt-factor contributions
dx = pbc[0] * sdata->domain.prd[0] + pbc[5] * sdata->domain.xy + pbc[4] * sdata->domain.xz;
dy = pbc[1] * sdata->domain.prd[1] + pbc[3] * sdata->domain.yz;
dz = pbc[2] * sdata->domain.prd[2];
}
}
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
FixShakeCuda_PackComm_Self_Kernel <<< grid, threads, 0>>>((int*) sdata->comm.sendlist.dev_data, n, sdata->comm.maxlistlength, iswap, dx, dy, dz, first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixShakeCuda_PackComm_Self: Kernel execution failed");
}
return 3 * n;
}
// Unpack n atoms' worth of xshake coordinates received in buf_recv (host
// memory) into the device slots starting at atom index 'first'.
void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv)
{
// re-sync device constants if atom storage or counts changed this step
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
int size = n * 3 * sizeof(X_CFLOAT);
if(sdata->buffer_new or (size > sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata, size);
int3 layout = getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal > 0) {
// stage received data through the shared device buffer, then scatter
cudaMemcpy(sdata->buffer, (void*)buf_recv, n * 3 * sizeof(X_CFLOAT), cudaMemcpyHostToDevice);
FixShakeCuda_UnpackComm_Kernel <<< grid, threads, 0>>>(n, first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed");
}
}

View File

@ -1,34 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
// Host-side C interface to the fix shake/cuda GPU implementation
// (defined in fix_shake_cuda.cu; called from the C++ fix class).
extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata, X_CFLOAT dtv, F_CFLOAT dtfsq,
void* shake_flag, void* shake_atom, void* shake_type, void* xshake,
void* bond_distance, void* angle_distance, void* virial,
int max_iter, X_CFLOAT tolerance);
extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata);
extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata, int vflag, int vflag_atom, int* list, int nlist);
extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata, int n, int iswap, void* buf_send, int* pbc, int pbc_flag);
extern "C" int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata, int n, int iswap, int first, int* pbc, int pbc_flag);
extern "C" void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata, int n, int first, void* buf_recv);

File diff suppressed because it is too large Load Diff

View File

@ -1,66 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_temp_berendsen_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_temp_berendsen_cuda_cu.h"
#include "fix_temp_berendsen_cuda_kernel.cu"
// Refresh this unit's __constant__ copies of atom-array device pointers and
// counters; call whenever device atom storage is reallocated (nmax changes).
// Fix: the velocity pointer was copied with sizeof(X_CFLOAT*) although v is
// a V_CFLOAT* (same size on all supported platforms, so no behavior change,
// but now consistent with the other USER-CUDA fixes).
void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_AP(mask) , & sdata->atom.mask .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int));
cudaMemcpyToSymbol(MY_AP(nmax) , & sdata->atom.nmax , sizeof(int));
cudaMemcpyToSymbol(MY_AP(v) , & sdata->atom.v .dev_data, sizeof(V_CFLOAT*));
}
// One-time setup for fix temp/berendsen/cuda: just seed the device pointers.
void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
}
/* End-of-step host driver for the Berendsen thermostat: rescales the
 * velocities of all local atoms in the fix group by afactor on the GPU.
 * groupbit selects the fix group; afactor is narrowed to the device
 * velocity precision (V_CFLOAT) before launch. */
void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
{
  V_CFLOAT factor = afactor;

  /* Refresh device pointers / counts only when the host marked them stale. */
  if(sdata->atom.update_nmax)
    Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixTempBerendsenCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor);
  cudaThreadSynchronize();
  /* Bug fix: the original error label was copy-pasted from an add_force
   * PostForce routine, which made failures here hard to attribute. */
  CUT_CHECK_ERROR("Cuda_FixTempBerendsenCuda_EndOfStep: end-of-step kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor);

View File

@ -1,37 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
/* Rescale the velocities of all owned atoms in the fix group by `factor`
 * (Berendsen thermostat end-of-step). Velocities are stored in
 * struct-of-arrays layout: x at [i], y at [i + _nmax], z at [i + 2*_nmax]. */
__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;            /* thread maps past the local atom count */

  if(!(_mask[idx] & groupbit)) return;  /* atom not in this fix's group */

  _v[idx]             *= factor;
  _v[idx + _nmax]     *= factor;
  _v[idx + 2 * _nmax] *= factor;
}

View File

@ -1,64 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_temp_rescale_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_temp_rescale_cuda_cu.h"
#include "fix_temp_rescale_cuda_kernel.cu"
/* Upload the device-array pointers and atom counts read by the temp/rescale
 * kernel into this translation unit's constant-memory symbols. Re-run after
 * any reallocation of the atom arrays. */
void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask)   , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal         , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)   , & sdata->atom.nmax           , sizeof(int));
  /* Consistency fix: v is the velocity array (V_CFLOAT); the original used
   * sizeof(X_CFLOAT*), harmless only because pointer sizes are uniform. */
  cudaMemcpyToSymbol(MY_AP(v)      , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
}
/* One-time setup for the temp/rescale fix; identical to a pointer refresh,
 * so it forwards to the nmax-update routine. */
void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
}
/* End-of-step host driver for fix temp/rescale: multiplies the velocities of
 * all local atoms in the fix group by afactor on the GPU. */
void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor)
{
  V_CFLOAT factor = afactor;

  //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
  /* Pointers are refreshed unconditionally: this fix runs on a stride of
   * timesteps, so the update_nmax flag could be cleared before we see it
   * (see the retained original comment above). */
  Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
  //if(sdata->atom.update_nlocal)
  //cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int) );

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixTempRescaleCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor);
  cudaThreadSynchronize();
  /* Bug fix: error label was copy-pasted from an add_force PostForce routine. */
  CUT_CHECK_ERROR("Cuda_FixTempRescaleCuda_EndOfStep: end-of-step kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor);

View File

@ -1,37 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
/* Scale all three velocity components of each owned atom in the fix group by
 * `factor`. Velocity storage is struct-of-arrays with stride _nmax. */
__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;            /* out-of-range thread */

  if(!(_mask[idx] & groupbit)) return;  /* atom outside the fix group */

  _v[idx]             *= factor;
  _v[idx + _nmax]     *= factor;
  _v[idx + 2 * _nmax] *= factor;
}

View File

@ -1,64 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_temp_rescale_limit_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_temp_rescale_limit_cuda_cu.h"
#include "fix_temp_rescale_limit_cuda_kernel.cu"
/* Upload device pointers and atom counts used by the temp/rescale/limit
 * kernel into constant-memory symbols. Re-run after atom-array reallocation. */
void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask)   , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal         , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)   , & sdata->atom.nmax           , sizeof(int));
  /* Consistency fix: v stores velocities (V_CFLOAT); original said
   * sizeof(X_CFLOAT*), relying on uniform pointer sizes. */
  cudaMemcpyToSymbol(MY_AP(v)      , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
}
/* One-time setup for the temp/rescale/limit fix — just a pointer upload,
 * delegated to the shared nmax-update routine. */
void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
}
/* End-of-step host driver for fix temp/rescale with a velocity limit:
 * scales group-member velocities by afactor, then clamps each component to
 * [-limit, limit] on the GPU. */
void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit)
{
  V_CFLOAT factor = afactor;

  //if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
  /* Unconditional refresh: the fix runs on a timestep stride, so the
   * update flags may be consumed between invocations (original note above). */
  Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
  //if(sdata->atom.update_nlocal)
  //cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal , sizeof(int) );

  int3 layout = getgrid(sdata->atom.nlocal);
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel <<< grid, threads, 0>>> (groupbit, factor, limit);
  cudaThreadSynchronize();
  /* Bug fix: error label was copy-pasted from an add_force PostForce routine. */
  CUT_CHECK_ERROR("Cuda_FixTempRescaleLimitCuda_EndOfStep: end-of-step kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit, double afactor, double limit);

View File

@ -1,44 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
/* Scale each velocity component of group-member atoms by `factor`, then clamp
 * the result to the interval [-limit, +limit] component-wise. */
__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit, V_CFLOAT factor, V_CFLOAT limit)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;            /* out-of-range thread */

  if(!(_mask[idx] & groupbit)) return;  /* atom outside the fix group */

  V_CFLOAT vel_x = factor * _v[idx];
  V_CFLOAT vel_y = factor * _v[idx + _nmax];
  V_CFLOAT vel_z = factor * _v[idx + 2 * _nmax];

  /* Clamp: positive components are capped at +limit, negative at -limit. */
  _v[idx]             = vel_x > 0 ? min(vel_x, limit) : max(vel_x, -limit);
  _v[idx + _nmax]     = vel_y > 0 ? min(vel_y, limit) : max(vel_y, -limit);
  _v[idx + 2 * _nmax] = vel_z > 0 ? min(vel_z, limit) : max(vel_z, -limit);
}

View File

@ -1,67 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_viscous_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_viscous_cuda_cu.h"
#include "fix_viscous_cuda_kernel.cu"
/* Upload the device pointers and atom counts the viscous-drag kernel reads
 * into constant-memory symbols. Re-run after atom-array reallocation. */
void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata)
{
  cudaMemcpyToSymbol(MY_AP(mask)   , & sdata->atom.mask .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nlocal) , & sdata->atom.nlocal         , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nmax)   , & sdata->atom.nmax           , sizeof(int));
  /* BUG FIX: the original uploaded the POSITION array (sdata->atom.x) into
   * the velocity symbol _v. The kernel computes f -= gamma * v, so it would
   * have applied the drag force to coordinates instead of velocities.
   * Upload the velocity array. */
  cudaMemcpyToSymbol(MY_AP(v)      , & sdata->atom.v    .dev_data, sizeof(V_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(f)      , & sdata->atom.f    .dev_data, sizeof(F_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(type)   , & sdata->atom.type .dev_data, sizeof(int*));
}
/* One-time setup for the viscous-drag fix; forwards to the pointer-refresh
 * routine, which contains all state this fix needs on the device. */
void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata)
{
  Cuda_FixViscousCuda_UpdateNmax(sdata);
}
/* Post-force host driver for the viscous-drag fix: launches a kernel that
 * subtracts gamma[type] * v from the force of each group-member atom.
 * `gamma` is an opaque device pointer to per-type drag coefficients
 * (cast to F_CFLOAT* at the launch site). */
void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void* gamma)
{
  /* Re-upload device pointers / counts only if the host marked them stale. */
  if(sdata->atom.update_nmax)
    Cuda_FixViscousCuda_UpdateNmax(sdata);

  if(sdata->atom.update_nlocal)
    cudaMemcpyToSymbol(MY_AP(nlocal)  , & sdata->atom.nlocal        , sizeof(int));

  const int3 layout = getgrid(sdata->atom.nlocal, 0);
  dim3 block_dim(layout.z, 1, 1);
  dim3 grid_dim(layout.x, layout.y, 1);

  Cuda_FixViscousCuda_PostForce_Kernel <<< grid_dim, block_dim, 0>>> (groupbit, (F_CFLOAT*) gamma);
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_Cuda_FixViscousCuda_PostForce: Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit, void* gamma);

View File

@ -1,35 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
/* Apply a per-type viscous drag force f -= gamma[type] * v to every owned
 * atom in the fix group. Arrays use struct-of-arrays layout with stride
 * _nmax per component. */
__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit, F_CFLOAT* gamma)
{
  const int idx = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(idx >= _nlocal) return;            /* out-of-range thread */

  if(!(_mask[idx] & groupbit)) return;  /* atom outside the fix group */

  const F_CFLOAT drag = gamma[_type[idx]];
  _f[idx]             -= drag * _v[idx];
  _f[idx + 1 * _nmax] -= drag * _v[idx + 1 * _nmax];
  _f[idx + 2 * _nmax] -= drag * _v[idx + 2 * _nmax];
}

View File

@ -1,364 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#include <time.h>
#define MY_PREFIX neighbor
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "cuda_wrapper_cu.h"
#define _cutneighsq MY_AP(cutneighsq)
#define _ex_type MY_AP(ex_type)
#define _nex_type MY_AP(nex_type)
#define _ex1_bit MY_AP(ex1_bit)
#define _ex2_bit MY_AP(ex2_bit)
#define _nex_group MY_AP(nex_group)
#define _ex_mol_bit MY_AP(ex_mol_bit)
#define _nex_mol MY_AP(nex_mol)
__device__ __constant__ CUDA_CFLOAT* _cutneighsq;
__device__ __constant__ int* _ex_type;
__device__ __constant__ int _nex_type;
__device__ __constant__ int* _ex1_bit;
__device__ __constant__ int* _ex2_bit;
__device__ __constant__ int _nex_group;
__device__ __constant__ int* _ex_mol_bit;
__device__ __constant__ int _nex_mol;
#include "neighbor_cu.h"
#include "neighbor_kernel.cu"
/* Grow (never shrink) the shared device scratch buffer so it can hold the
 * binning data: 20 ints of header/status plus, per bin, one counter int and
 * bin_nmax packed (x,y,z) coordinates. Publishes the (possibly new) buffer
 * pointer to the constant-memory symbol used by the kernels. */
void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  /* Bug fix: both error labels originally said "Cuda_PairLJCutCuda"
   * (copy-paste from the pair style), mis-attributing failures. */
  CUT_CHECK_ERROR("Cuda_Neighbor: before updateBuffer failed");
  int size = (unsigned)(sizeof(int) * 20 + sneighlist->bin_dim[0] * sneighlist->bin_dim[1] * sneighlist->bin_dim[2] * (sizeof(int) + sneighlist->bin_nmax * 3 * sizeof(CUDA_CFLOAT)));

  if(sdata->buffersize < size) {
    MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)

    if(sdata->buffer != NULL) CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);

    sdata->buffer = CudaWrapper_AllocCudaData(size);
    sdata->buffersize = size;
    sdata->buffer_new++;  /* signals other modules that their cached buffer pointer is stale */
    MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
  }

  cudaMemcpyToSymbol(MY_AP(buffer), & sdata->buffer, sizeof(int*));
  CUT_CHECK_ERROR("Cuda_Neighbor: updateBuffer failed");
}
/* Sort all atoms (local + ghost) into spatial bins on the GPU.
 * Returns the overflow flag read back from the device: non-zero when at
 * least one bin exceeded bin_nmax entries, in which case bin_extraspace is
 * enlarged so the caller can retry with bigger bins (retry protocol assumed
 * from the +=0.05 growth — confirm against the calling neighbor code). */
int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  if(sdata->buffer_new)
    Cuda_Neighbor_UpdateBuffer(sdata, sneighlist);

  /* Reciprocal bin width per dimension over the subdomain; the "- 4.0"
   * presumably reserves two layers of ghost bins on each side — confirm. */
  CUDA_CFLOAT rez_bin_size[3] = {
    (1.0 * sneighlist->bin_dim[0] - 4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
    (1.0 * sneighlist->bin_dim[1] - 4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
    (1.0 * sneighlist->bin_dim[2] - 4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
  };

  /* Cleanup: the original wrapped these uploads in `if(!init)` with a
   * NON-static flag that was also (re)set to 0 inside the branch, so they
   * ran on every call anyway. The dead flag is removed; refreshing every
   * call is required regardless, since the device pointers may change
   * between neighbor builds. */
  cudaMemcpyToSymbol(MY_AP(x)     , & sdata->atom.x   .dev_data, sizeof(X_CFLOAT*));
  cudaMemcpyToSymbol(MY_AP(nall)  , & sdata->atom.nall          , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_AP(nmax)  , & sdata->atom.nmax          , sizeof(unsigned));
  cudaMemcpyToSymbol(MY_AP(sublo) ,   sdata->domain.sublo       , sizeof(X_CFLOAT) * 3);

  int3 layout = getgrid(sdata->atom.nall); // sneighlist->inum
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);

  my_times starttime, endtime;
  my_gettime(CLOCK_REALTIME, &starttime);

  /* Zero the status header, per-bin counters and the binned coordinate area. */
  cudaMemset((int*)(sdata->buffer), 0, sizeof(int) * (20 + (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2])) + 3 * sizeof(CUDA_CFLOAT) * (sneighlist->bin_dim[0]) * (sneighlist->bin_dim[1]) * (sneighlist->bin_dim[2]) * (sneighlist->bin_nmax));
  Binning_Kernel <<< grid, threads>>> (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], sneighlist->bin_dim[2], rez_bin_size[0], rez_bin_size[1], rez_bin_size[2]);
  cudaThreadSynchronize();

  my_gettime(CLOCK_REALTIME, &endtime);
  sdata->cuda_timings.neigh_bin +=
    endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;

  /* buffer[0] is the kernel's overflow flag. */
  int binning_error;
  cudaMemcpy((void*) &binning_error, (void*) sdata->buffer, 1 * sizeof(int), cudaMemcpyDeviceToHost);

  if(binning_error) {
    sneighlist->bin_extraspace += 0.05;  /* grow bins for the caller's retry */
  } else {
    MYDBG(printf("CUDA: binning successful\n");)
  }

  CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed");
  return binning_error;
}
/* Build a full neighbor list from the previously binned atoms (one thread
 * block per bin). Also builds border/inner sublists when overlap_comm is
 * enabled, and post-processes special (bonded) neighbors for molecular
 * systems. Returns the status read back from device buffer[0]; negative
 * presumably signals a neighbor-list overflow requiring a retry — confirm
 * against the caller. */
int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  //Cuda_Neighbor_UpdateBuffer(sdata,sneighlist);
  /* globcutoff stays -1 unless every type-pair cutoff is (numerically) the
   * same, in which case the kernel can skip the per-pair table lookup. */
  CUDA_CFLOAT globcutoff = -1.0;

  short init = 0;  /* NOTE(review): not static, so this "init" block runs on EVERY call */

  if(! init) {
    init = 1;

    // !! LAMMPS indexes atom types starting with 1 !!
    unsigned cuda_ntypes = sdata->atom.ntypes + 1;
    unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
    CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);
    //printf("Allocate: %i\n",nx);
    /* NOTE(review): the previous device allocation in cu_cutneighsq is
     * overwritten without CudaWrapper_FreeCudaData — device-side leak on
     * repeat calls (size needed for the free is nx of the PREVIOUS call). */
    sneighlist->cu_cutneighsq = (CUDA_CFLOAT*) CudaWrapper_AllocCudaData(nx);

    if(sneighlist->cutneighsq) {
      int cutoffsdiffer = 0;
      double cutoff0 = sneighlist->cutneighsq[1][1];

      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
        for(int j = 1; j <= sdata->atom.ntypes; ++j) {
          acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);

          if((sneighlist->cutneighsq[i][j] - cutoff0) * (sneighlist->cutneighsq[i][j] - cutoff0) > 1e-6) cutoffsdiffer++;
        }
      }

      if(not cutoffsdiffer) globcutoff = (CUDA_CFLOAT) cutoff0;
    } else {
      MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
      free(acutneighsq);  /* bug fix: host buffer leaked on this error path */
      return 0;
    }

    int size = 100;

    if(sdata->buffersize < size) {
      MYDBG(printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
      CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
      sdata->buffer = CudaWrapper_AllocCudaData(size);
      sdata->buffersize = size;
      sdata->buffer_new++;
      MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
    }

    CudaWrapper_UploadCudaData(acutneighsq, sneighlist->cu_cutneighsq, nx);
    cudaMemcpyToSymbol(MY_AP(cutneighsq)   , &sneighlist->cu_cutneighsq , sizeof(CUDA_CFLOAT*));
    cudaMemcpyToSymbol(MY_AP(cuda_ntypes)  , & cuda_ntypes              , sizeof(unsigned));
    cudaMemcpyToSymbol(MY_AP(special_flag) ,   sdata->atom.special_flag , 4 * sizeof(int));
    cudaMemcpyToSymbol(MY_AP(molecular)    , & sdata->atom.molecular    , sizeof(int));
    /* Bug fix: the host staging buffer was never freed — since this block
     * runs on every call (non-static init), it leaked nx bytes per build. */
    free(acutneighsq);
  }

  /* Refresh every device symbol the build kernels read; pointers may have
   * changed since the last build. */
  cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned));
  //cudaMemcpyToSymbol(MY_AP(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) );
  cudaMemcpyToSymbol(MY_AP(ilist)        , & sneighlist->ilist    .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(inum)         , & sneighlist->inum              , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nlocal)       , & sdata->atom.nlocal            , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nall)         , & sdata->atom.nall              , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(numneigh)     , & sneighlist->numneigh .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(type)         , & sdata->atom.type     .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(mask)         , & sdata->atom.mask     .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(tag)          , & sdata->atom.tag      .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(special)      , & sdata->atom.special  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(maxspecial)   , & sdata->atom.maxspecial        , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nspecial)     , & sdata->atom.nspecial .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(maxneighbors) , & sneighlist->maxneighbors      , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(debugdata)    , & sdata->debugdata              , sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(overlap_comm) , & sdata->overlap_comm           , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(neighbors)    , & sneighlist->neighbors.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ex_type)      , & sneighlist->ex_type  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ex1_bit)      , & sneighlist->ex1_bit  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ex2_bit)      , & sneighlist->ex2_bit  .dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(ex_mol_bit)   , & sneighlist->ex_mol_bit.dev_data, sizeof(int*));
  cudaMemcpyToSymbol(MY_AP(nex_type)     , & sneighlist->nex_type          , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nex_group)    , & sneighlist->nex_group         , sizeof(int));
  cudaMemcpyToSymbol(MY_AP(nex_mol)      , & sneighlist->nex_mol           , sizeof(int));

  /* Extra sublists for communication/computation overlap. */
  if(sdata->overlap_comm) {
    cudaMemcpyToSymbol(MY_AP(numneigh_border)  , & sneighlist->numneigh_border .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(numneigh_inner)   , & sneighlist->numneigh_inner  .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(neighbors_inner)  , & sneighlist->neighbors_inner .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(ilist_border)     , & sneighlist->ilist_border    .dev_data, sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(inum_border)      , & sneighlist->inum_border     .dev_data, sizeof(int*));
  }

  //dim3 threads(sneighlist->bin_nmax,1,1);
  /* One block per bin (x*y bins in grid.x, z bins in grid.y), at most 128
   * threads; shared memory stages one int + 3 coords per thread. */
  dim3 threads(MIN(128, sneighlist->bin_nmax), 1, 1);
  dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1], sneighlist->bin_dim[2], 1);
  //printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_CFLOAT))*threads.x,sneighlist->bin_nmax);
  int buffer[20];
  buffer[0] = 1;   /* status word the kernel overwrites on overflow */
  buffer[1] = 0;
  CudaWrapper_UploadCudaData(buffer, sdata->buffer, 2 * sizeof(int));
  CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error");
  //cudaMemset(sdata->debugdata,0,100*sizeof(int));
  unsigned int shared_size = (sizeof(int) + 3 * sizeof(CUDA_CFLOAT)) * threads.x;
  MYDBG(printf("Configuration: %i %i %i %u %i\n", grid.x, grid.y, threads.x, shared_size, sneighlist->bin_nmax);)
  //shared_size=2056;
  my_times starttime, endtime;
  my_gettime(CLOCK_REALTIME, &starttime);
  //for(int i=0;i<100;i++)
  {
    /* Three variants: overlap-comm build, or exclusion-aware / plain builds
     * selected by a compile-time template flag. */
    if(sdata->overlap_comm)
      NeighborBuildFullBin_OverlapComm_Kernel <<< grid, threads, shared_size>>>
      (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom);
    else {
      int exclude = sneighlist->nex_mol | sneighlist->nex_group | sneighlist->nex_type;

      if(exclude)
        NeighborBuildFullBin_Kernel<1> <<< grid, threads, shared_size>>>
        (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
      else
        NeighborBuildFullBin_Kernel<0> <<< grid, threads, shared_size>>>
        (sneighlist->binned_id, sneighlist->bin_nmax, sneighlist->bin_dim[0], sneighlist->bin_dim[1], globcutoff, sdata->pair.use_block_per_atom, sdata->pair.neighall);
    }

    //NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_CFLOAT))*threads.x+sizeof(int)>>>
    //  (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff);

    cudaThreadSynchronize();
    CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
    my_gettime(CLOCK_REALTIME, &endtime);
    sdata->cuda_timings.neigh_build +=
      endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
    //dim3 threads,grid;
    CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int));

    /* Mark bonded (special) neighbors for molecular systems, but only if the
     * build itself did not report an overflow. */
    if(buffer[0] >= 0 && true && sdata->atom.molecular) {
      //printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall);
      my_gettime(CLOCK_REALTIME, &starttime);
      int3 layout = getgrid(sdata->atom.nlocal, 0, 512);
      threads.x = layout.z;
      threads.y = 1;
      threads.z = 1;
      grid.x = layout.x;
      grid.y = layout.y;
      grid.z = 1;
      FindSpecial <<< grid, threads>>>(sdata->pair.use_block_per_atom);
      cudaThreadSynchronize();
      CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed");
      my_gettime(CLOCK_REALTIME, &endtime);
      sdata->cuda_timings.neigh_special +=
        endtime.tv_sec - starttime.tv_sec + 1.0 * (endtime.tv_nsec - starttime.tv_nsec) / 1000000000;
    }
  }

  //printf("Neightime: %lf\n",sdata->cuda_timings.test1);
  CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
  //CudaWrapper_DownloadCudaData(buffer, sneighlist->numneigh_border .dev_data, sizeof(int));
  MYDBG(printf("Cuda_NeighborBuildFullBin build neighbor list ... end\n");)
  return buffer[0];
}
/* Build a full neighbor list with the brute-force O(N^2) all-pairs method
 * (no binning). Uploads the cutoff table and all device symbols the kernel
 * reads, launches one thread per local atom, and returns the status flag
 * read back from device buffer[0]. */
int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
  MYDBG(printf("Cuda_NeighborBuildFullNsq build neighbor list ... start\n");)
  // initialize only on first call
  /* NOTE(review): `init` is NOT static (the /*static*/ marker was removed),
   * so this whole block actually re-runs on every call. */
  /*static*/ short init = 0;

  if(! init) {
    init = 1;

    // !! LAMMPS indexes atom types starting with 1 !!
    unsigned cuda_ntypes = sdata->atom.ntypes + 1;

    /* NOTE(review): this only warns and then proceeds; with too many types
     * the cutneighsq upload below would write past the compile-time array. */
    if(cuda_ntypes * cuda_ntypes > CUDA_MAX_TYPES2)
      printf("# CUDA: Cuda_PairLJCutCuda_Init: you need %u types. this is more than %u "
             "(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
             "or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);

    /* Host staging copy of the squared neighbor cutoffs, flattened to
     * [i * cuda_ntypes + j]. */
    unsigned nx = sizeof(CUDA_CFLOAT) * cuda_ntypes * cuda_ntypes;
    CUDA_CFLOAT* acutneighsq = (CUDA_CFLOAT*) malloc(nx);

    if(sneighlist->cutneighsq) {
      for(int i = 1; i <= sdata->atom.ntypes; ++i) {
        for(int j = 1; j <= sdata->atom.ntypes; ++j) {
          acutneighsq[i * cuda_ntypes + j] = (CUDA_CFLOAT)(sneighlist->cutneighsq[i][j]);
          //printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]);
        }
      }
    } else {
      /* No cutoff table available: cannot build. NOTE(review): acutneighsq
       * leaks on this path. */
      MYEMUDBG(printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n");)
      return 0;
    }

    /* Ensure the shared device buffer can hold the 20-int status header. */
    int size = 100;

    if(sdata->buffersize < size) {
      MYDBG(printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize);)
      CudaWrapper_FreeCudaData(sdata->buffer, sdata->buffersize);
      sdata->buffer = CudaWrapper_AllocCudaData(size);
      sdata->buffersize = size;
      sdata->buffer_new++;
      MYDBG(printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize);)
    }

    cudaMemcpyToSymbol(MY_AP(buffer)           , & sdata->buffer                   , sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(cuda_ntypes)      , & cuda_ntypes                     , sizeof(unsigned));
    /* NOTE(review): this copies nx bytes of TABLE DATA into the cutneighsq
     * symbol, which is declared above as a pointer (CUDA_CFLOAT*); the
     * binned builder instead uploads a device pointer of sizeof(CUDA_CFLOAT*).
     * Suspicious — confirm which declaration this call is meant to target. */
    cudaMemcpyToSymbol(MY_AP(cutneighsq)       ,   acutneighsq                     , nx);
    cudaMemcpyToSymbol(MY_AP(neighbor_maxlocal), & sneighlist->firstneigh.dim[0]   , sizeof(unsigned));
    cudaMemcpyToSymbol(MY_AP(firstneigh)       , & sneighlist->firstneigh.dev_data , sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(ilist)            , & sneighlist->ilist     .dev_data , sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(inum)             , & sneighlist->inum                , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(nlocal)           , & sdata->atom.nlocal              , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(nall)             , & sdata->atom.nall                , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(nmax)             , & sdata->atom.nmax                , sizeof(int));
    cudaMemcpyToSymbol(MY_AP(numneigh)         , & sneighlist->numneigh  .dev_data , sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(type)             , & sdata->atom.type      .dev_data , sizeof(int*));
    cudaMemcpyToSymbol(MY_AP(x)                , & sdata->atom.x         .dev_data , sizeof(X_CFLOAT*));
    cudaMemcpyToSymbol(MY_AP(maxneighbors)     , & sneighlist->maxneighbors        , sizeof(int));
    free(acutneighsq);
  }

  /* One thread per local atom. */
  int3 layout = getgrid(sdata->atom.nlocal); // sneighlist->inum
  dim3 threads(layout.z, 1, 1);
  dim3 grid(layout.x, layout.y, 1);
  /* buffer[0] is preset to 1 (success); the kernel overwrites it on failure. */
  int return_value = 1;
  CudaWrapper_UploadCudaData(& return_value, sdata->buffer, sizeof(int));
  CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel execution failed");
  NeighborBuildFullNsq_Kernel <<< grid, threads>>> ();
  cudaThreadSynchronize();
  CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
  int buffer[20];
  CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int) * 20);
  MYDBG(printf("Cuda_NeighborBuildFullNSQ build neighbor list ... end\n");)
  /* Returns the device-reported status word (the assignment is redundant
   * but harmless). */
  return return_value = buffer[0];
}

View File

@ -1,32 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef NEIGHBOR_CU_H_
#define NEIGHBOR_CU_H_
#include "cuda_shared.h"
// Host entry points for GPU neighbor-list construction:
//   Cuda_BinAtoms             -- sort local+ghost atoms into spatial bins
//   Cuda_NeighborBuildFullBin -- build a full neighbor list from binned atoms
//   Cuda_NeighborBuildFullNsq -- brute-force O(N^2) full neighbor list
// All return a status int (0 presumably indicates failure/overflow -- confirm
// against the host-side implementations).
extern "C" int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
extern "C" int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
extern "C" int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
#endif /*NEIGHBOR_CU_H_*/

View File

@ -1,660 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#define SBBITS 30
__global__ void Binning_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, int bin_dim_z,
                               CUDA_CFLOAT rez_bin_size_x, CUDA_CFLOAT rez_bin_size_y, CUDA_CFLOAT rez_bin_size_z)
{
  // Sort all atoms (local + ghost, i < _nall) into spatial bins, one thread
  // per atom.  rez_bin_size_* are reciprocal bin edge lengths.
  //
  // Scratch layout inside _buffer:
  //   word 0..1   : status words (word 0 accumulates the overflow count)
  //   binned_x    : per-bin x,y,z coordinate planes, bin_nmax entries per bin
  //   bin_count   : number of atoms currently placed in each bin
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
  binned_x = &binned_x[2];
  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];

  if(i < _nall) {
    // copy atom position from global device memory to local register
    // in this 3 steps to get as much coalesced access as possible
    X_CFLOAT* my_x = _x + i;
    CUDA_CFLOAT x_i = *my_x;
    my_x += _nmax;
    CUDA_CFLOAT y_i = *my_x;
    my_x += _nmax;
    CUDA_CFLOAT z_i = *my_x;

    // calculate flat bin index; the +2 shifts into the ghost-bin border
    int bx = __float2int_rd(rez_bin_size_x * (x_i - _sublo[0])) + 2;
    int by = __float2int_rd(rez_bin_size_y * (y_i - _sublo[1])) + 2;
    int bz = __float2int_rd(rez_bin_size_z * (z_i - _sublo[2])) + 2;

    // branch-free clamp of bx/by/bz into [0, bin_dim_* - 1]
    bx -= bx * negativCUDA(1.0f * bx);
    bx -= (bx - bin_dim_x + 1) * negativCUDA(1.0f * bin_dim_x - 1.0f - 1.0f * bx);
    by -= by * negativCUDA(1.0f * by);
    by -= (by - bin_dim_y + 1) * negativCUDA(1.0f * bin_dim_y - 1.0f - 1.0f * by);
    bz -= bz * negativCUDA(1.0f * bz);
    bz -= (bz - bin_dim_z + 1) * negativCUDA(1.0f * bin_dim_z - 1.0f - 1.0f * bz);

    const unsigned j = bin_dim_z * (bin_dim_y * bx + by) + bz;

    // add new atom to bin, get bin-array position
    const unsigned k = atomicAdd(& bin_count[j], 1);

    if(k < bin_nmax) {
      binned_id [bin_nmax * j + k] = i;
      binned_x [3 * bin_nmax * j + k] = x_i;
      binned_x [3 * bin_nmax * j + k + bin_nmax] = y_i;
      binned_x [3 * bin_nmax * j + k + 2 * bin_nmax] = z_i;
    } else {
      // normally, this should not happen: the bin is full.  Count the dropped
      // atom in status word 0 so the host can detect the overflow.
      // (Previously the atomicAdd result was stored in an unused local.)
      atomicAdd((int*) _buffer, 1);
      MYEMUDBG(printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j);)
    }
  }
}
// Return 1 if the pair (i, j) is excluded from the neighbor list by any of
// the three LAMMPS exclusion mechanisms (type pair, group pair, or same
// molecule), 0 otherwise.
__device__ inline int exclusion(int &i, int &j, int &itype, int &jtype)
{
  // Type-pair exclusion table.
  if(_nex_type && _ex_type[itype * _cuda_ntypes + jtype]) return 1;

  // Group-pair exclusions: the pair is excluded if it matches either
  // orientation (i in group1 / j in group2, or vice versa).
  for(int k = 0; k < _nex_group; k++) {
    if(((_mask[i] & _ex1_bit[k]) && (_mask[j] & _ex2_bit[k])) ||
       ((_mask[i] & _ex2_bit[k]) && (_mask[j] & _ex1_bit[k])))
      return 1;
  }

  // Intramolecular exclusions: only relevant when both atoms belong to the
  // same molecule.  (_nex_mol guard kept first so _molecule is only read
  // when molecular exclusions are active.)
  if(_nex_mol) {
    if(_molecule[i] == _molecule[j]) {
      for(int k = 0; k < _nex_mol; k++)
        if((_mask[i] & _ex_mol_bit[k]) && (_mask[j] & _ex_mol_bit[k])) return 1;
    }
  }

  return 0;
}
extern __shared__ CUDA_CFLOAT shared[];
// Search the first n.z entries of the special-neighbor tag list for 'tag'.
// n = (n12, n12+n13, n12+n13+n14) cumulative counts; the slot index of the
// match selects the corresponding flag component (flag.x for a 1-2 partner,
// flag.y for 1-3, flag.z for 1-4).  Returns 0 when tag is not special.
__device__ inline int find_special(int3 &n, int* list, int &tag, int3 flag)
{
  // Find the index of the LAST occurrence of tag (matches the original
  // forward scan that kept overwriting on every hit); n.z means "absent".
  int k = n.z;

  for(int l = n.z - 1; l >= 0; l--) {
    if(list[l] == tag) {
      k = l;
      break;
    }
  }

  if(k < n.x) return flag.x;
  if(k < n.y) return flag.y;
  if(k < n.z) return flag.z;
  return 0;
}
// Build a full neighbor list from pre-binned atoms.  One thread block per
// bin; each thread owns one atom of the bin (processed in blockDim.x-sized
// tiles) and scans (1) its home bin and (2) the 26 surrounding bins, staging
// candidate coordinates through dynamic shared memory.
//   exclude    (template) : compile in the exclusion() tests
//   globcutoff > 0        : uniform squared cutoff; < 0: per-type-pair _cutneighsq
//   block_style != 0      : neighbors stored contiguously per atom (row-major);
//                           otherwise strided by natoms (column-major)
//   neighall              : also build lists for ghost atoms (natoms = _nall)
template <const unsigned int exclude>
__global__ void NeighborBuildFullBin_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_CFLOAT globcutoff, int block_style, bool neighall)
{
  int natoms = neighall ? _nall : _nlocal;
  //const bool domol=false;
  int bin_dim_z = gridDim.y;
  // Scratch layout in _buffer as produced by Binning_Kernel:
  // 2 status words, then binned coordinates, then per-bin counts.
  CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
  binned_x = &binned_x[2];
  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
  int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
  int bin_x = blockIdx.x / bin_dim_y;
  int bin_y = blockIdx.x - bin_x * bin_dim_y;
  int bin_z = blockIdx.y;
  int bin_c = bin_count[bin];

  CUDA_CFLOAT cut;

  if(globcutoff > 0)
    cut = globcutoff;

  int i = _nall;  // sentinel: "no atom assigned to this thread yet"
  CUDA_CFLOAT* my_x;
  CUDA_CFLOAT x_i, y_i, z_i;

  // Process the home bin's atoms in tiles of blockDim.x threads.
  for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
    int actIdx = threadIdx.x + actOffset;
    // Dynamic shared memory: 3*blockDim.x coordinates followed by ids.
    CUDA_CFLOAT* other_x = shared;
    int* other_id = (int*) &other_x[3 * blockDim.x];

    if(actIdx < bin_c) {
      i = binned_id[__mul24(bin, bin_nmax) + actIdx];
      my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
      x_i = *my_x;
      my_x += bin_nmax;
      y_i = *my_x;
      my_x += bin_nmax;
      z_i = *my_x;
    } else
      i = 2 * _nall;  // inactive thread for this tile (fails every i < natoms test)

    __syncthreads();
    int jnum = 0;
    int itype;

    if(i < natoms) {
      jnum = 0;
      _ilist[i] = i;
      itype = _type[i];
    }

    //__syncthreads();

    // --- pass 1: pairs within the home bin itself ---
    for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
      int otherActIdx = threadIdx.x + otherActOffset;

      if(otherActIdx < bin_c) {
        if(otherActOffset == actOffset) {
          // Candidate tile is this thread's own tile: publish the registers.
          other_id[threadIdx.x] = i;
          other_x[threadIdx.x] = x_i;
          other_x[threadIdx.x + blockDim.x] = y_i;
          other_x[threadIdx.x + 2 * blockDim.x] = z_i;
        } else {
          other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
          my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
          other_x[threadIdx.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + blockDim.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
        }
      }

      __syncthreads();
      // Each thread starts its scan at its own slot and wraps around --
      // presumably to spread shared-memory reads across banks; confirm intent.
      int kk = threadIdx.x;

      for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
        if(i < natoms) {
          kk++;
          kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
          int j = other_id[kk];

          if(exclude && exclusion(i, j, itype, _type[j])) continue;

          if(globcutoff < 0) {
            int jtype = _type[j];
            cut = _cutneighsq[itype * _cuda_ntypes + jtype];
          }

          CUDA_CFLOAT delx = x_i - other_x[kk];
          CUDA_CFLOAT dely = y_i - other_x[kk + blockDim.x];
          CUDA_CFLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
          CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;

          if(rsq <= cut && i != j) {
            if(jnum < _maxneighbors) {
              if(block_style)
                _neighbors[i * _maxneighbors + jnum] = j;
              else
                _neighbors[i + jnum * natoms] = j;
            }

            // Keep counting past capacity so overflow can be reported below.
            ++jnum;
          }
        }
      }

      __syncthreads();
    }

    // --- pass 2: pairs with the up-to-26 surrounding bins ---
    for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
      for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
        for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
          if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;

          if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;

          int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;

          if(other_bin == bin) continue;  // home bin was handled in pass 1

          int obin_c = bin_count[other_bin];

          for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
            int otherActIdx = otherActOffset + threadIdx.x;

            if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
              other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
              my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
              other_x[threadIdx.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + blockDim.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
            }

            __syncthreads();

            for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
              if(i < natoms) {
                int j = other_id[k];

                if(exclude && exclusion(i, j, itype, _type[j])) continue;

                if(globcutoff < 0) {
                  int jtype = _type[j];
                  cut = _cutneighsq[itype * _cuda_ntypes + jtype];
                }

                CUDA_CFLOAT delx = x_i - other_x[k];
                CUDA_CFLOAT dely = y_i - other_x[k + blockDim.x];
                CUDA_CFLOAT delz = z_i - other_x[k + 2 * blockDim.x];
                CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;

                if(rsq <= cut && i != j) {
                  if(jnum < _maxneighbors) {
                    if(block_style)
                      _neighbors[i * _maxneighbors + jnum] = j;
                    else
                      _neighbors[i + jnum * natoms] = j;
                  }

                  ++jnum;
                }
              }
            }

            __syncthreads();
          }
        }

    // Report overflow: the host sees the negative count in the status word
    // and can reallocate with a larger _maxneighbors.
    if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum;

    if(i < natoms)
      _numneigh[i] = jnum;
  }
}
// Post-process each local atom's freshly built neighbor list: neighbors that
// are 1-2 / 1-3 / 1-4 bonded "special" partners are, per _special_flag[1..3],
// either removed from the list (flag 0), kept unmarked (flag 1), or kept with
// the special level encoded into the high bits of the entry via
// j ^ (which << SBBITS).  One thread per list entry in _ilist.
__global__ void FindSpecial(int block_style)
{
  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  int which;
  int tag_mask = 0;
  int3 spec_flag;
  int3 mynspecial = {0, 0, 1};

  if(ii >= _nlocal) return;

  int special_id[CUDA_MAX_NSPECIAL];
  int i = _ilist[ii];

  if(i >= _nlocal) return;

  int jnum = _numneigh[i];

  // Translate _special_flag into per-level actions: -1 = drop the neighbor,
  // 0 = keep unmarked, positive = mark with that level (1/2/3).
  if(_special_flag[1] == 0) spec_flag.x = -1;
  else if(_special_flag[1] == 1) spec_flag.x = 0;
  else spec_flag.x = 1;

  if(_special_flag[2] == 0) spec_flag.y = -1;
  else if(_special_flag[2] == 1) spec_flag.y = 0;
  else spec_flag.y = 2;

  if(_special_flag[3] == 0) spec_flag.z = -1;
  else if(_special_flag[3] == 1) spec_flag.z = 0;
  else spec_flag.z = 3;

  // Cumulative counts of 1-2 / 1-3 / 1-4 partners of atom i (nmax-strided).
  mynspecial.x = _nspecial[i];
  mynspecial.y = _nspecial[i + _nmax];
  mynspecial.z = _nspecial[i + 2 * _nmax];

  if(i < _nlocal) {
    int* list = &_special[i];

    // Gather special tags; tag_mask ORs all tag bits together and acts as a
    // cheap superset prefilter before the exact find_special() search.
    for(int k = 0; k < mynspecial.z; k++) {
      special_id[k] = list[k * _nmax];
      tag_mask = tag_mask | special_id[k];
    }
  }

  for(int k = 0; k < MIN(jnum, _maxneighbors); k++) {
    int j;

    if(block_style)
      j = _neighbors[i * _maxneighbors + k];
    else
      j = _neighbors[i + k * _nlocal];

    int tag_j = _tag[j];
    which = 0;

    // Quick reject: tag_j can only be in the special list if every set bit
    // of tag_j also appears in tag_mask.
    if((tag_mask & tag_j) == tag_j) {
      which = find_special(mynspecial, special_id, tag_j, spec_flag);

      if(which > 0) {
        // Mark: encode the special level in the entry's high bits.
        if(block_style)
          _neighbors[i * _maxneighbors + k] = j ^ (which << SBBITS);
        else
          _neighbors[i + k * _nlocal] = j ^ (which << SBBITS);
      } else if(which < 0) {
        // Drop: swap in the current last neighbor, shrink the list, and
        // re-examine slot k (the swapped-in entry has not been checked).
        if(block_style)
          _neighbors[i * _maxneighbors + k] = _neighbors[i * _maxneighbors + jnum - 1];
        else
          _neighbors[i + k * _nlocal] = _neighbors[i + (jnum - 1) * _nlocal];

        jnum--;
        k--;
      }
    }
  }

  _numneigh[i] = jnum;
}
// Variant of NeighborBuildFullBin_Kernel that, in addition to the full list,
// splits every local atom's neighbors into an "inner" list (neighbor is also
// local) and a "border" list (neighbor is a ghost, j >= _nlocal).  This lets
// force evaluation on inner pairs overlap with ghost-atom communication.
// Differences from the plain kernel: no exclusion support, local atoms only,
// and border list slots are allocated lazily with atomicAdd(_inum_border).
__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id, int bin_nmax, int bin_dim_x, int bin_dim_y, CUDA_CFLOAT globcutoff, int block_style)
{
  int bin_dim_z = gridDim.y;
  // Scratch layout in _buffer as produced by Binning_Kernel:
  // 2 status words, then binned coordinates, then per-bin counts.
  CUDA_CFLOAT* binned_x = (CUDA_CFLOAT*) _buffer;
  binned_x = &binned_x[2];
  int* bin_count = (int*) &binned_x[3 * bin_dim_x * bin_dim_y * bin_dim_z * bin_nmax];
  int bin = __mul24(gridDim.y, blockIdx.x) + blockIdx.y;
  int bin_x = blockIdx.x / bin_dim_y;
  int bin_y = blockIdx.x - bin_x * bin_dim_y;
  int bin_z = blockIdx.y;
  int bin_c = bin_count[bin];

  CUDA_CFLOAT cut;

  if(globcutoff > 0)
    cut = globcutoff;

  int i = _nall;  // sentinel: "no atom assigned to this thread yet"
  CUDA_CFLOAT* my_x;
  CUDA_CFLOAT x_i, y_i, z_i;

  // Process the home bin's atoms in tiles of blockDim.x threads.
  for(int actOffset = 0; actOffset < bin_c; actOffset += blockDim.x) {
    int actIdx = threadIdx.x + actOffset;
    // Dynamic shared memory: 3*blockDim.x coordinates followed by ids.
    CUDA_CFLOAT* other_x = shared;
    int* other_id = (int*) &other_x[3 * blockDim.x];

    if(actIdx < bin_c) {
      i = binned_id[__mul24(bin, bin_nmax) + actIdx];
      my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + actIdx;
      x_i = *my_x;
      my_x += bin_nmax;
      y_i = *my_x;
      my_x += bin_nmax;
      z_i = *my_x;
    } else
      i = 2 * _nall;  // inactive thread for this tile

    __syncthreads();
    int jnum = 0;          // total neighbors of atom i
    int jnum_border = 0;   // neighbors with j >= _nlocal (ghosts)
    int jnum_inner = 0;    // neighbors with j < _nlocal
    int i_border = -1;     // slot in the border list; allocated on first ghost
    int itype;

    if(i < _nlocal) {
      jnum = 0;
      _ilist[i] = i;
      itype = _type[i];
    }

    __syncthreads();

    // --- pass 1: pairs within the home bin itself ---
    for(int otherActOffset = 0; otherActOffset < bin_c; otherActOffset += blockDim.x) {
      int otherActIdx = threadIdx.x + otherActOffset;

      if(otherActIdx < bin_c) {
        if(otherActOffset == actOffset) {
          // Candidate tile is this thread's own tile: publish the registers.
          other_id[threadIdx.x] = i;
          other_x[threadIdx.x] = x_i;
          other_x[threadIdx.x + blockDim.x] = y_i;
          other_x[threadIdx.x + 2 * blockDim.x] = z_i;
        } else {
          other_id[threadIdx.x] = binned_id[__mul24(bin, bin_nmax) + otherActIdx];
          my_x = binned_x + __mul24(__mul24(bin, 3), bin_nmax) + otherActIdx;
          other_x[threadIdx.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + blockDim.x] = *my_x;
          my_x += bin_nmax;
          other_x[threadIdx.x + __mul24(2, blockDim.x)] = *my_x;
        }
      }

      __syncthreads();
      // Staggered start index, wrapping around the tile -- presumably to
      // spread shared-memory reads across banks; confirm intent.
      int kk = threadIdx.x;

      for(int k = 0; k < MIN(bin_c - otherActOffset, blockDim.x); ++k) {
        if(i < _nlocal) {
          kk++;
          kk = kk < MIN(bin_c - otherActOffset, blockDim.x) ? kk : 0;
          int j = other_id[kk];

          if(globcutoff < 0) {
            int jtype = _type[j];
            cut = _cutneighsq[itype * _cuda_ntypes + jtype];
          }

          CUDA_CFLOAT delx = x_i - other_x[kk];
          CUDA_CFLOAT dely = y_i - other_x[kk + blockDim.x];
          CUDA_CFLOAT delz = z_i - other_x[kk + 2 * blockDim.x];
          CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;

          if(rsq <= cut && i != j) {
            // First ghost neighbor: claim a border-list slot for atom i.
            if((j >= _nlocal) && (i_border < 0))
              i_border = atomicAdd(_inum_border, 1);

            if(jnum < _maxneighbors) {
              if(block_style) {
                _neighbors[i * _maxneighbors + jnum] = j;

                if(j >= _nlocal) {
                  _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
                } else {
                  _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
                }
              } else {
                _neighbors[i + jnum * _nlocal] = j;

                if(j >= _nlocal) {
                  _neighbors_border[i_border + jnum_border * _nlocal] = j;
                } else {
                  _neighbors_inner[i + jnum_inner * _nlocal] = j;
                }
              }
            }

            ++jnum;

            if(j >= _nlocal)
              jnum_border++;
            else
              jnum_inner++;
          }
        }
      }

      __syncthreads();
    }

    // --- pass 2: pairs with the up-to-26 surrounding bins ---
    for(int obin_x = bin_x - 1; obin_x < bin_x + 2; obin_x++)
      for(int obin_y = bin_y - 1; obin_y < bin_y + 2; obin_y++)
        for(int obin_z = bin_z - 1; obin_z < bin_z + 2; obin_z++) {
          if(obin_x < 0 || obin_y < 0 || obin_z < 0) continue;

          if(obin_x >= bin_dim_x || obin_y >= bin_dim_y || obin_z >= bin_dim_z) continue;

          int other_bin = bin_dim_z * (bin_dim_y * obin_x + obin_y) + obin_z;

          if(other_bin == bin) continue;  // home bin was handled in pass 1

          int obin_c = bin_count[other_bin];

          for(int otherActOffset = 0; otherActOffset < obin_c; otherActOffset += blockDim.x) {
            int otherActIdx = otherActOffset + threadIdx.x;

            if(threadIdx.x < MIN(blockDim.x, obin_c - otherActOffset)) {
              other_id[threadIdx.x] = binned_id[__mul24(other_bin, bin_nmax) + otherActIdx];
              my_x = binned_x + __mul24(__mul24(other_bin, 3), bin_nmax) + otherActIdx;
              other_x[threadIdx.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + blockDim.x] = *my_x;
              my_x += bin_nmax;
              other_x[threadIdx.x + 2 * blockDim.x] = *my_x;
            }

            __syncthreads();

            for(int k = 0; k < MIN(blockDim.x, obin_c - otherActOffset); ++k) {
              if(i < _nlocal) {
                int j = other_id[k];

                if(globcutoff < 0) {
                  int jtype = _type[j];
                  cut = _cutneighsq[itype * _cuda_ntypes + jtype];
                }

                CUDA_CFLOAT delx = x_i - other_x[k];
                CUDA_CFLOAT dely = y_i - other_x[k + blockDim.x];
                CUDA_CFLOAT delz = z_i - other_x[k + 2 * blockDim.x];
                CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;

                if(rsq <= cut && i != j) {
                  if((j >= _nlocal) && (i_border < 0))
                    i_border = atomicAdd(_inum_border, 1);

                  if(jnum < _maxneighbors) {
                    if(block_style) {
                      _neighbors[i * _maxneighbors + jnum] = j;

                      if(j >= _nlocal) {
                        _neighbors_border[i_border * _maxneighbors + jnum_border] = j;
                      } else {
                        _neighbors_inner[i * _maxneighbors + jnum_inner] = j;
                      }
                    } else {
                      _neighbors[i + jnum * _nlocal] = j;

                      if(j >= _nlocal) {
                        _neighbors_border[i_border + jnum_border * _nlocal] = j;
                      } else {
                        _neighbors_inner[i + jnum_inner * _nlocal] = j;
                      }
                    }
                  }

                  ++jnum;

                  if(j >= _nlocal)
                    jnum_border++;
                  else
                    jnum_inner++;
                }
              }
            }

            __syncthreads();
          }
        }

    // Report overflow: negative count in status word 0 for the host.
    if(jnum > _maxneighbors)((int*)_buffer)[0] = -jnum;

    if(i < _nlocal) {
      _numneigh[i] = jnum;
      _numneigh_inner[i] = jnum_inner;

      if(i_border >= 0) _numneigh_border[i_border] = jnum_border;

      if(i_border >= 0) _ilist_border[i_border] = i;
    }
  }
}
// Brute-force O(N^2) full neighbor build: each local atom i (one thread per
// atom) tests every atom j in local+ghost against the per-type-pair squared
// cutoff and appends hits to its _firstneigh list.
//
// Fixes relative to the original:
//  * removed leftover hard-coded debug instrumentation that dumped the
//    neighbor list of atom 151 into the comm buffer on every build;
//  * removed two __syncthreads() calls that sat inside the divergent
//    if(i < _nlocal) branch (undefined behavior in CUDA when not all threads
//    of a block reach the barrier); no shared memory is used here, so the
//    barriers served no purpose.
__global__ void NeighborBuildFullNsq_Kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  // buffer[0] is preset by the host (to 1) and cleared on list overflow.
  int* buffer = (int*) _buffer;

  if(i < _nlocal) {
    // Load x/y/z of atom i from the three nmax-strided coordinate planes.
    X_CFLOAT* my_x = _x + i;
    CUDA_CFLOAT x_i = *my_x;
    my_x += _nmax;
    CUDA_CFLOAT y_i = *my_x;
    my_x += _nmax;
    CUDA_CFLOAT z_i = *my_x;

    int jnum = 0;
    int* jlist = _firstneigh[i];
    _ilist[i] = i;
    int itype = _type[i];

    for(int j = 0; j < _nall; ++j) {
      my_x = _x + j;
      CUDA_CFLOAT x_j = *my_x;
      my_x += _nmax;
      CUDA_CFLOAT y_j = *my_x;
      my_x += _nmax;
      CUDA_CFLOAT z_j = *my_x;
      CUDA_CFLOAT delx = x_i - x_j;
      CUDA_CFLOAT dely = y_i - y_j;
      CUDA_CFLOAT delz = z_i - z_j;
      CUDA_CFLOAT rsq = delx * delx + dely * dely + delz * delz;
      int jtype = _type[j];

      if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j) {
        // Stop storing past capacity, but keep counting so the overflow
        // can be flagged below.
        if(jnum < _maxneighbors)
          jlist[jnum] = j;

        ++jnum;
      }
    }

    // Signal overflow to the host by clearing the success flag.
    if(jnum > _maxneighbors) buffer[0] = 0;

    _numneigh[i] = jnum;
  }
}

View File

@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _sigma MY_AP(coeff2)
#define _a MY_AP(coeff3)
#define _c MY_AP(coeff4)
#define _d MY_AP(coeff5)
#include "pair_born_coul_long_cuda_cu.h"
#include "pair_born_coul_long_cuda_kernel_nc.cu"
#include <time.h>
// One-time device setup for pair_style born/coul/long: upload the 5 per-type
// coefficient arrays (mapped above to coeff1..coeff5: rhoinv, sigma, a, c, d).
// The 'true' presumably enables the Coulomb/charge setup required by the
// coul/long variant -- confirm against Cuda_Pair_Init_AllStyles.
void Cuda_PairBornCoulLongCuda_Init(cuda_shared_data* sdata)
{
  Cuda_Pair_Init_AllStyles(sdata, 5, true);
}
// Host-side driver for pair_style born/coul/long: lazily upload coefficients
// on the first call, let the shared pre-kernel helper size the launch, then
// dispatch the templated pair kernel on the pair stream and run the common
// post-kernel bookkeeping.
void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  static short init = 0;

  if(init == 0) {
    Cuda_PairBornCoulLongCuda_Init(sdata);
    init = 1;
  }

  dim3 grid, threads;
  int sharedperproc;
  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);

  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
  const size_t shared_bytes = sharedperproc * sizeof(ENERGY_CFLOAT) * threads.x;

  if(sdata->pair.use_block_per_atom)
    Pair_Kernel_BpA<PAIR_BORN, COUL_LONG, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
  else
    Pair_Kernel_TpA<PAIR_BORN, COUL_LONG, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);

  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
#undef _rhoinv
#undef _sigma
#undef _a
#undef _c
#undef _d

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// Host entry point for the born/coul/long pair style; the prototype differs
// when the binned-neighbor code path (CUDA_USE_BINNING) is compiled in.
#ifdef CUDA_USE_BINNING
extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag);
#else
extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
#endif

View File

@ -1,36 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// Evaluate the Born-Mayer-Huggins pair interaction at squared distance rsq
// for the flattened type pair ij_type.  Returns force/r^2 scaled by
// factor_lj (so the caller multiplies by the displacement components);
// when eflag is set, the pair energy -- shifted by the tabulated offset --
// is accumulated into evdwl.  Expression structure is kept exactly as in
// the original to preserve floating-point results.
__device__ inline F_CFLOAT PairBornCuda_Eval(const F_CFLOAT &rsq, const int ij_type, F_CFLOAT &factor_lj, int &eflag, ENERGY_CFLOAT &evdwl)
{
  const F_CFLOAT inv_r2 = F_F(1.0) / rsq;
  const F_CFLOAT r = _RSQRT_(inv_r2);   // r = sqrt(rsq)
  const F_CFLOAT inv_r6 = inv_r2 * inv_r2 * inv_r2;
  const F_CFLOAT exp_term = _EXP_((_sigma[ij_type] - r) * _rhoinv[ij_type]);
  const F_CFLOAT fborn = _a[ij_type] * _rhoinv[ij_type] * r * exp_term -
                         F_F(6.0) * _c[ij_type] * inv_r6 + F_F(8.0) * _d[ij_type] * inv_r2 * inv_r6;

  if(eflag) evdwl += factor_lj * (_a[ij_type] * exp_term - _c[ij_type] * inv_r6
                                  + _d[ij_type] * inv_r2 * inv_r6 - _offset[ij_type]);

  return factor_lj * fborn * inv_r2;
}

View File

@ -1,75 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _buck1 MY_AP(coeff2)
#define _buck2 MY_AP(coeff3)
#define _a MY_AP(coeff4)
#define _c MY_AP(coeff5)
#include "pair_buck_coul_cut_cuda_cu.h"
#include <time.h>
// One-time device setup for pair_style buck/coul/cut: upload the 5 per-type
// coefficient arrays (mapped above to coeff1..coeff5: rhoinv, buck1, buck2,
// a, c).  The 'true' presumably enables the Coulomb/charge setup required by
// the coul variant -- confirm against Cuda_Pair_Init_AllStyles.
void Cuda_PairBuckCoulCutCuda_Init(cuda_shared_data* sdata)
{
  Cuda_Pair_Init_AllStyles(sdata, 5, true);
}
// Host-side driver for pair_style buck/coul/cut: lazily upload coefficients
// on the first call, let the shared pre-kernel helper size the launch, then
// dispatch the templated pair kernel on the pair stream and run the common
// post-kernel bookkeeping.
void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  static short init = 0;

  if(init == 0) {
    Cuda_PairBuckCoulCutCuda_Init(sdata);
    init = 1;
  }

  dim3 grid, threads;
  int sharedperproc;
  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);

  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
  const size_t shared_bytes = sharedperproc * sizeof(ENERGY_CFLOAT) * threads.x;

  if(sdata->pair.use_block_per_atom)
    Pair_Kernel_BpA<PAIR_BUCK, COUL_CUT, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
  else
    Pair_Kernel_TpA<PAIR_BUCK, COUL_CUT, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);

  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
#undef _rhoinv
#undef _buck1
#undef _buck2
#undef _a
#undef _c

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// Host entry point for the buck/coul/cut pair style; the prototype differs
// when the binned-neighbor code path (CUDA_USE_BINNING) is compiled in.
#ifdef CUDA_USE_BINNING
extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, int eflag, int vflag);
#else
extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
#endif

View File

@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _buck1 MY_AP(coeff2)
#define _buck2 MY_AP(coeff3)
#define _a MY_AP(coeff4)
#define _c MY_AP(coeff5)
#include "pair_buck_coul_long_cuda_cu.h"
#include <time.h>
// One-time device setup for pair_style buck/coul/long: upload the 5 per-type
// coefficient arrays (mapped above to coeff1..coeff5: rhoinv, buck1, buck2,
// a, c).  The 'true' presumably enables the Coulomb/charge setup required by
// the coul/long variant -- confirm against Cuda_Pair_Init_AllStyles.
void Cuda_PairBuckCoulLongCuda_Init(cuda_shared_data* sdata)
{
  Cuda_Pair_Init_AllStyles(sdata, 5, true);
}
// Host-side driver for pair_style buck/coul/long: lazily upload coefficients
// on the first call, let the shared pre-kernel helper size the launch, then
// dispatch the templated pair kernel on the pair stream and run the common
// post-kernel bookkeeping.
void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  static short init = 0;

  if(init == 0) {
    Cuda_PairBuckCoulLongCuda_Init(sdata);
    init = 1;
  }

  dim3 grid, threads;
  int sharedperproc;
  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, true, 192);

  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
  const size_t shared_bytes = sharedperproc * sizeof(ENERGY_CFLOAT) * threads.x;

  if(sdata->pair.use_block_per_atom)
    Pair_Kernel_BpA<PAIR_BUCK, COUL_LONG, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
  else
    Pair_Kernel_TpA<PAIR_BUCK, COUL_LONG, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);

  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
}
#undef _rhoinv
#undef _buck1
#undef _buck2
#undef _a
#undef _c

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// Host entry point for the buck/coul/long pair style; the prototype differs
// when the binned-neighbor code path (CUDA_USE_BINNING) is compiled in.
#ifdef CUDA_USE_BINNING
extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag);
#else
extern "C" void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
#endif

View File

@ -1,77 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _buck1 MY_AP(coeff2)
#define _buck2 MY_AP(coeff3)
#define _a MY_AP(coeff4)
#define _c MY_AP(coeff5)
#include "pair_buck_cuda_cu.h"
#include "pair_buck_cuda_kernel_nc.cu"
#include <time.h>
// One-time device setup for pair_style buck: upload the 5 per-type
// coefficient arrays (mapped above to coeff1..coeff5: rhoinv, buck1, buck2,
// a, c).  Unlike the coul variants, no third argument is passed here --
// plain buck needs no Coulomb/charge setup.
void Cuda_PairBuckCuda_Init(cuda_shared_data* sdata)
{
  Cuda_Pair_Init_AllStyles(sdata, 5);
}
// Host-side driver for pair_style buck (no Coulomb): lazily upload
// coefficients on the first call, let the shared pre-kernel helper size the
// launch (note the 'false' -- no Coulomb data needed), then dispatch the
// templated pair kernel and run the common post-kernel bookkeeping.
void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  static short init = 0;

  if(init == 0) {
    Cuda_PairBuckCuda_Init(sdata);
    init = 1;
  }

  dim3 grid, threads;
  int sharedperproc;
  Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc, false, 192);

  cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
  const size_t shared_bytes = sharedperproc * sizeof(ENERGY_CFLOAT) * threads.x;

  if(sdata->pair.use_block_per_atom)
    Pair_Kernel_BpA<PAIR_BUCK, COUL_NONE, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);
  else
    Pair_Kernel_TpA<PAIR_BUCK, COUL_NONE, DATA_NONE>
    <<< grid, threads, shared_bytes, streams[1]>>> (eflag, vflag, eflag_atom, vflag_atom);

  Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
}
#undef _rhoinv
#undef _buck1
#undef _buck2
#undef _a
#undef _c

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"

// Host entry point for the buck pair style; the prototype differs when the
// binned-neighbor code path (CUDA_USE_BINNING) is compiled in.
#ifdef CUDA_USE_BINNING
extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, int eflag, int vflag);
#else
extern "C" void Cuda_PairBuckCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag, int eflag_atom, int vflag_atom);
#endif

Some files were not shown because too many files have changed in this diff Show More