git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@8921 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp 2012-10-08 15:29:39 +00:00
parent abc9afbec6
commit 856a237400
186 changed files with 0 additions and 22340 deletions

View File

@ -1,4 +0,0 @@
#Makefile for liblammpscuda.a
#No need to modify anything here! The CUDA path is inserted into Makefile.common
include Makefile.cudalib

View File

@ -1,124 +0,0 @@
#Common command-line argument interpreter for compilation with lammpscuda (USER-CUDA) installed
# make options:
# emu=1 switch to CUDA emulation mode (otherwise: use the GPU)
# dbg=1 print a lot of debugging output during runtime
# verbose=1 output the nvcc command line during compilation
# keep=1 do not delete temporary compilation files (.ii, .cubin, ...)
# cufft=1 use CUDA's fast Fourier transform library "cufft" where possible (otherwise: use CPU FFTW)
# binning=1 create a virtual particle grid (otherwise: neighbor lists); currently not supported
# precision=1 single precision (global setting)
# precision=2 double precision (global setting)
# precision=3 double precision for positions only (see below)
# precision=4 double precision for positions and velocities (see below)
SHELL = /bin/sh
# System-specific settings
#CUDA_INSTALL_PATH = /usr/local/cuda
CUDA_INSTALL_PATH = /home/crtrott/lib/cuda
# e.g. in Gentoo
# CUDA_INSTALL_PATH = /opt/cuda
#//////////////////////////////////////////////////////////////////////////////////////////////
# no need to change anything below this line
#//////////////////////////////////////////////////////////////////////////////////////////////
#use CPU FFT if cufft=0 is requested.
FALLBACK_FFT = 1
#default settings for compiler switches
ifdef COMPILELIB
include Makefile.defaults
else
include ../../lib/cuda/Makefile.defaults
endif
#shell echo "Compiling with precision = " ${precision} ", arch = " ${arch} ", cufft = " ${cufft} ", dbg = " ${dbg} ", prec_timer = " ${prec_timer}
CUDA_FLAGS := -I${CUDA_INSTALL_PATH}/include -DUNIX
CUDA_USRLIB_CONDITIONAL := -L${CUDA_INSTALL_PATH}/lib -L${CUDA_INSTALL_PATH}/lib64
# debug setting
ifeq ($(strip $(dbg)), 1)
CUDA_FLAGS += -D_DEBUG -g
NVCC_FLAGS += -g -G
else
NVCC_FLAGS += --compiler-options -fno-strict-aliasing -O2
endif
# precise timing must be disabled manually on Mac and Windows (prec_timer=0)
ifeq ($(strip $(prec_timer)), 0)
CUDA_FLAGS += -DNO_PREC_TIMING
endif
# set fft routine
ifeq ($(strip $(cufft)), 0)
ifneq ($(FALLBACK_FFT), 1)
FFT_INC = -DFFT_NONE
FFT_PATH =
FFT_LIB =
CUDA_FLAGS += -DFFT_NONE
endif
else
CUDA_FLAGS += -DFFT_CUFFT
CUDA_USRLIB_CONDITIONAL += -lcufft
endif
# make global precision setting
ifeq ($(strip $(precision)), 1)
CUDA_FLAGS += -DCUDA_PRECISION=1
else
ifeq ($(strip $(precision)), 3)
CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2
else
ifeq ($(strip $(precision)), 4)
CUDA_FLAGS += -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2
else
CUDA_FLAGS += -DCUDA_PRECISION=2
endif
endif
endif
# make architecture settings
ifeq ($(strip $(arch)), 13)
CUDA_FLAGS += -DCUDA_ARCH=13
SMVERSIONFLAGS := -arch sm_13
else
ifeq ($(strip $(arch)), 20)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_20
else
ifeq ($(strip $(arch)), 21)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_21
else
ifeq ($(strip $(arch)), 30)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_30
else
ifeq ($(strip $(arch)), 35)
CUDA_FLAGS += -DCUDA_ARCH=20
#NVCC_FLAGS += -ftz=false -prec-div=true -prec-sqrt=true
NVCC_FLAGS += -ftz=true -prec-div=false -prec-sqrt=false
SMVERSIONFLAGS := -arch sm_35
else
CUDA_FLAGS += -DCUDA_ARCH=99
SMVERSIONFLAGS := -arch sm_13
endif
endif
endif
endif
endif
CCFLAGS := $(CCFLAGS) $(CUDA_FLAGS) \
-I$(CUDA_INSTALL_PATH)/include
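For reference, the options documented at the top of Makefile.common above are plain command-line overrides to make; a couple of hypothetical invocations might look like:

  make precision=2 arch=20 cufft=1    # double precision, sm_20, FFTs via CUFFT
  make precision=1 arch=35 dbg=1      # single precision, sm_35, with debug output

Note that an unrecognized arch value falls through to the default branch above (CUDA_ARCH=99 with sm_13), so a typo in arch surfaces at run time rather than as a make error.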

View File

@ -1,87 +0,0 @@
#Makefile for liblammpscuda.a
#No need to modify anything here! The CUDA path is inserted into Makefile.common
.DEFAULT: lib
COMPILELIB := 1
SHELL = /bin/sh
CUDA_SRC_DIR = ../cuda
CUDA_TEMP = $(CUDA_SRC_DIR)/.lastmake
CUDA_TEMP_DUMMY := $(shell touch $(CUDA_TEMP) )
include $(CUDA_TEMP)
CUDA_CU = $(wildcard $(CUDA_SRC_DIR)/*_kernel.cu)
CUDA_CUO = $(CUDA_CU:_kernel.cu=_cu.o)
CUDA_OBJ = $(subst $(CUDA_SRC_DIR)/,,$(CUDA_CUO))
CUDA_DEP = $(CUDA_OBJ:.o=.d)
NVCC_FLAGS :=
VPATH = $(CUDA_SRC_DIR)
#rewriting default settings if new ones are specified
ifdef precision
tmp := $(shell sed -i 's|precision ?= [0-9]|precision ?= '${precision}'|g' Makefile.defaults)
endif
ifdef arch
tmp := $(shell sed -i 's|arch ?= [0-9][0-9]|arch ?= '${arch}'|g' Makefile.defaults)
endif
ifdef cufft
tmp := $(shell sed -i 's|cufft ?= [0-9]|cufft ?= '${cufft}'|g' Makefile.defaults)
endif
ifdef dbg
tmp := $(shell sed -i 's|dbg ?= [0-9]|dbg ?= '${dbg}'|g' Makefile.defaults)
endif
ifdef prec_timer
tmp := $(shell sed -i 's|prec_timer ?= [0-9]|prec_timer ?= '${prec_timer}'|g' Makefile.defaults)
endif
include Makefile.common
tmp := $(shell sed -i '2 d' Makefile.lammps)
tmp := $(shell sed -i '2 d' Makefile.lammps)
tmp := $(shell sed -i '1a CUDA_FLAGS := ${CUDA_FLAGS}' Makefile.lammps)
tmp := $(shell sed -i '2a CUDA_USRLIB_CONDITIONAL := ${CUDA_USRLIB_CONDITIONAL}' Makefile.lammps)
# verbose nvcc output during compilation
ifeq ($(verbose), 1)
VERBOSE :=
NVCC_FLAGS += --ptxas-options=-v
else
VERBOSE := @
endif
# keep temporary compilation files of nvcc
ifeq ($(keep), 1)
NVCC_FLAGS += -keep -Xptxas="--verbose"
endif
NVCC := $(CUDA_INSTALL_PATH)/bin/nvcc
CUDA_INCLUDES = -I./ -I$(CUDA_INSTALL_PATH)/include -I../../src/USER-CUDA
CUDA_USRLIB =
# Link target
lib: $(CUDA_OBJ)
$(NVCC) -lib $(CUDA_OBJ) $(CUDA_FLAGS) $(CUDA_USRLIB_CONDITIONAL) -o liblammpscuda.a
clean:
rm $(CUDA_SRC_DIR)/*.o
rm liblammpscuda.a
# CUDA compilation rules
%_cu.o: %.cu %_kernel.cu %_cu.h cuda_shared.h
$(VERBOSE)$(NVCC) $(NVCC_FLAGS) $(CUDA_FLAGS) $(CUDA_INCLUDES) $(CUDA_USRLIB) $(SMVERSIONFLAGS) -o $@ -c $<
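One consequence of the sed calls above: options passed on the command line are written back into Makefile.defaults, so they persist for subsequent builds. For example, a hypothetical "make precision=2" effectively runs

  sed -i 's|precision ?= [0-9]|precision ?= 2|g' Makefile.defaults

which turns the line "precision ?= 1" into "precision ?= 2"; the next plain "make" then reuses double precision without being told.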

View File

@ -1,19 +0,0 @@
#precision setting: 1 single, 2 double, 3 double positions, 4 double positions and velocities
precision ?= 1
#verbose setting: 0 no, 1 yes
verbose ?= 1
#GPU architecture (compute capability): 13, 20, 21, 30, 35
arch ?= 20
#use cufft (should not be changed)
cufft ?= 1
#debug mode: 0 off, 1 on
dbg ?= 0
#on Mac machines set this to 0 to avoid using the Linux-specific precision timer
prec_timer ?= 1

View File

@ -1,8 +0,0 @@
# Settings that the LAMMPS build will import when this package library is used
CUDA_FLAGS := -I/usr/local/cuda/include -DUNIX -DFFT_CUFFT -DCUDA_PRECISION=1 -DX_PRECISION=2 -DV_PRECISION=2 -DCUDA_ARCH=20
CUDA_USRLIB_CONDITIONAL := -L/usr/local/cuda/lib -L/usr/local/cuda/lib64 -lcufft
user-cuda_SYSINC = ${CUDA_FLAGS}
user-cuda_SYSLIB = -lcuda -lcudart -lrt
user-cuda_SYSPATH = $(CUDA_USRLIB_CONDITIONAL)

View File

@ -1,26 +0,0 @@
This directory has source files to build a library that LAMMPS
links against when using the USER-CUDA package.
When you are done building this library, two files should
exist in this directory:
liblammpscuda.a the library LAMMPS will link against
Makefile.lammps settings the LAMMPS Makefile will import
The latter file will have settings like the following (a setting can be omitted if blank):
user-cuda_SYSINC = -I$(CUDA_INSTALL_PATH)/include
user-cuda_SYSLIB = -lcuda -lcudart -lrt
user-cuda_SYSPATH = -L$(CUDA_INSTALL_PATH)/lib64 -L$(CUDA_INSTALL_PATH)/lib $(CUDA_USRLIB_CONDITIONAL)
SYSINC is for settings needed to compile LAMMPS source files
SYSLIB is for additional system libraries needed by this package
SYSPATH is the path(s) to where those libraries are
You must ensure these settings are correct for your system, else
the LAMMPS build will likely fail.
-------------------------------------------------------------------------
Christian - additional info is needed here about how
to build the lammpscuda lib.
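In the meantime, a plausible build sequence based on the Makefiles in this directory (a sketch, not an official recipe; the "make yes-user-cuda" step assumes the standard LAMMPS package mechanism):

  cd lib/cuda
  # set CUDA_INSTALL_PATH in Makefile.common to your CUDA toolkit
  make precision=2 arch=20     # builds liblammpscuda.a and regenerates Makefile.lammps
  cd ../../src
  make yes-user-cuda
  make <machine>               # your usual LAMMPS machine target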

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
const unsigned int ANGLE_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;
#include "atom_vec_angle_cuda_cu.h"
void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata)
{
return Cuda_AtomVecCuda_Init<ANGLE_DATA_MASK>(sdata);
}
int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata,n,dim,buf_send);
}
int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}
int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}
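Every atom-style wrapper in this file follows the same shape: compose a bitmask of the per-atom arrays the style carries, then forward to the shared Cuda_AtomVecCuda_* templates. As a hypothetical sketch, a new style that only adds a molecule ID to the atomic fields would look like this (the Cuda_AtomVecMolCuda_* names are invented for illustration):

  const unsigned int MOL_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|MOLECULE_MASK;

  void Cuda_AtomVecMolCuda_Init(cuda_shared_data* sdata)
  {
    // the mask selects which device arrays the shared template touches
    return Cuda_AtomVecCuda_Init<MOL_DATA_MASK>(sdata);
  }

  int Cuda_AtomVecMolCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
  {
    // border communication omits forces, images, and velocities
    const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK;
    return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
  }

Because data_mask is a compile-time template parameter, the unused branches in the shared kernels compile away, so each style pays only for the fields it actually communicates.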

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_ANGLE_CUDA_CU_H_
#define ATOM_VEC_ANGLE_CUDA_CU_H_
extern "C" void Cuda_AtomVecAngleCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecAngleCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send);
extern "C" int Cuda_AtomVecAngleCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecAngleCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecAngleCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAngleCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv);
extern "C" int Cuda_AtomVecAngleCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv);
#endif /*ATOM_VEC_ANGLE_CUDA_CU_H_*/

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
const unsigned int ATOMIC_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK;
#include "atom_vec_atomic_cuda_cu.h"
void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata)
{
return Cuda_AtomVecCuda_Init<ATOMIC_DATA_MASK>(sdata);
}
int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK;
return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata,n,dim,buf_send);
}
int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK;
return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK;
return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}
int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_ATOMIC_CUDA_CU_H_
#define ATOM_VEC_ATOMIC_CUDA_CU_H_
extern "C" void Cuda_AtomVecAtomicCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecAtomicCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send);
extern "C" int Cuda_AtomVecAtomicCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecAtomicCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv);
extern "C" int Cuda_AtomVecAtomicCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv);
#endif /*ATOM_VEC_ATOMIC_CUDA_CU_H_*/

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
const unsigned int CHARGE_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK;
#include "atom_vec_charge_cuda_cu.h"
void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata)
{
return Cuda_AtomVecCuda_Init<CHARGE_DATA_MASK>(sdata);
}
int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK;
return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata,n,dim,buf_send);
}
int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK;
return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK;
return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}
int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}

View File

@ -1,15 +0,0 @@
#ifndef ATOM_VEC_CHARGE_CUDA_CU_H_
#define ATOM_VEC_CHARGE_CUDA_CU_H_
extern "C" void Cuda_AtomVecChargeCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecChargeCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send);
extern "C" int Cuda_AtomVecChargeCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecChargeCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecChargeCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecChargeCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv);
extern "C" int Cuda_AtomVecChargeCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv);
#endif /*ATOM_VEC_CHARGE_CUDA_CU_H_*/

View File

@ -1,564 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX atom_vec_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "cuda_wrapper_cu.h"
#include "crm_cuda_utils.cu"
#include "atom_vec_cuda_kernel.cu"
int AtomVecCuda_CountDataItems(unsigned int data_mask)
{
int n=0;
if(data_mask & X_MASK) n+=3;
if(data_mask & V_MASK) n+=3;
if(data_mask & F_MASK) n+=3;
if(data_mask & TAG_MASK) n++;
if(data_mask & TYPE_MASK) n++;
if(data_mask & MASK_MASK) n++;
if(data_mask & IMAGE_MASK) n++;
if(data_mask & Q_MASK) n++;
if(data_mask & MOLECULE_MASK) n++;
if(data_mask & RMASS_MASK) n++;
if(data_mask & RADIUS_MASK) n++;
if(data_mask & DENSITY_MASK) n++;
if(data_mask & OMEGA_MASK) n+=3;
if(data_mask & TORQUE_MASK) n+=3; // torque is a 3-vector, like omega
//if(data_mask & NSPECIAL_MASK) n+=3;
return n;
}
void Cuda_AtomVecCuda_UpdateBuffer(cuda_shared_data* sdata,int size)
{
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_AtomVecCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(image) , & sdata->atom.image.dev_data, sizeof(int*) );
if(data_mask & Q_MASK) cudaMemcpyToSymbolAsync(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) );
if(data_mask & MOLECULE_MASK) cudaMemcpyToSymbolAsync(MY_CONST(molecule) , & sdata->atom.molecule.dev_data, sizeof(int*) );
if(data_mask & RADIUS_MASK) cudaMemcpyToSymbolAsync(MY_CONST(radius) , & sdata->atom.radius.dev_data, sizeof(int*) );
if(data_mask & DENSITY_MASK) cudaMemcpyToSymbolAsync(MY_CONST(density) , & sdata->atom.density.dev_data, sizeof(int*) );
if(data_mask & RMASS_MASK) cudaMemcpyToSymbolAsync(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(int*) );
if(data_mask & OMEGA_MASK) cudaMemcpyToSymbolAsync(MY_CONST(omega) , & sdata->atom.omega.dev_data, sizeof(int*) );
//if(data_mask & NSPECIAL_MASK) cudaMemcpyToSymbol(MY_CONST(nspecial) , & sdata->atom.nspecial.dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) );
}
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_Init(cuda_shared_data* sdata)
{
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... start\n"); )
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbolAsync(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... post Nmax\n"); )
cudaMemcpyToSymbolAsync(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT));
cudaMemcpyToSymbolAsync(MY_CONST(sublo) , & sdata->domain.sublo, 3*sizeof(X_FLOAT) );
cudaMemcpyToSymbolAsync(MY_CONST(subhi) , & sdata->domain.subhi, 3*sizeof(X_FLOAT) );
cudaMemcpyToSymbolAsync(MY_CONST(flag) , & sdata->flag, sizeof(int*) );
cudaThreadSynchronize();
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_Init ... end\n"); )
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbolAsync(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int n_data_items=AtomVecCuda_CountDataItems(data_mask);
int size=(n*n_data_items)*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
cudaMemset( sdata->flag,0,sizeof(int));
clock_gettime(CLOCK_REALTIME,&time1);
void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer;
Cuda_AtomVecCuda_PackComm_Kernel<data_mask><<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n
,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_kernel_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
if(not sdata->overlap_comm)
cudaMemcpy(buf_send, sdata->buffer, n*n_data_items*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_forward_download+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
if(aflag!=0) printf("aflag PackComm: %i\n",aflag);
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm: Kernel execution failed");
}
return n_data_items*n;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
MYDBG(printf(" # CUDA: AtomVecCuda_PackComm_Self\n");)
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int n_data_items=AtomVecCuda_CountDataItems(data_mask);
int size=(n*n_data_items)*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
static int count=-1;
count++;
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
clock_gettime(CLOCK_REALTIME,&time1);
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self:Pre Kernel execution failed");
Cuda_AtomVecCuda_PackComm_Self_Kernel<data_mask><<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_kernel_self+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackComm_Self: Kernel execution failed");
}
return n_data_items*n;
}
template <const unsigned int data_mask>
void Cuda_AtomVecCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap)
{
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int n_data_items=AtomVecCuda_CountDataItems(data_mask);
int size=(n*n_data_items)*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
clock_gettime(CLOCK_REALTIME,&time1);
if(not sdata->overlap_comm||iswap<0)
cudaMemcpy(sdata->buffer,(void*)buf_recv, n_data_items*n*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_upload+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer;
Cuda_AtomVecCuda_UnpackComm_Kernel<data_mask><<<grid, threads,0>>>(n,first,buf);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_forward_kernel_unpack+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackComm: Kernel execution failed");
}
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send)
{
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... start dim %i \n",dim); )
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: pre Kernel execution failed");
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
Cuda_AtomVecCuda_Init<data_mask>(sdata);
int size=n*sizeof(double);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
cudaMemset((int*) (sdata->buffer),0,sizeof(int));
int3 layout=getgrid(sdata->atom.nlocal,sizeof(int),256,true);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
Cuda_AtomVecCuda_PackExchangeList_Kernel<<<grid, threads,(threads.x+1)*sizeof(int)>>>(n-1,dim);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: Kernel execution failed");
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_exchange_kernel_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
cudaMemcpy(buf_send, sdata->buffer, sizeof(double), cudaMemcpyDeviceToHost);
int return_value = ((int*) buf_send)[0];
if(n>1+return_value)
cudaMemcpy(buf_send, sdata->buffer, (1+return_value)*sizeof(double), cudaMemcpyDeviceToHost);
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchangeList: return copy failed");
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_exchange_download+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchangeList ... done\n"); )
return return_value;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... start \n"); )
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int n_data_items=AtomVecCuda_CountDataItems(data_mask)+1;
int size=(nsend*n_data_items+1)*sizeof(double);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
cudaMemset((int*) (sdata->buffer),0,sizeof(int));
int3 layout=getgrid(nsend,0);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
Cuda_AtomVecCuda_PackExchange_Kernel<data_mask><<<grid, threads,0>>>(nsend,(int*) copylist);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackExchange: Kernel execution failed");
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_exchange_kernel_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_exchange_download+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
MYDBG( printf("# CUDA: Cuda_AtomVecCuda_PackExchange ... done\n"); )
return nsend*n_data_items+1;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int n_data_items=AtomVecCuda_CountDataItems(data_mask)+1;
int size=(nsend*n_data_items+1)*sizeof(double);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
cudaMemcpyToSymbol(MY_CONST(flag) , & sdata->flag, sizeof(int*) );
cudaMemset((int*) (sdata->flag),0,sizeof(int));
if(nsend)
{
int3 layout=getgrid(nsend,0);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
cudaMemcpy(sdata->buffer,buf_send , size, cudaMemcpyHostToDevice);
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_exchange_upload+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
Cuda_AtomVecCuda_UnpackExchange_Kernel<data_mask><<<grid, threads,0>>>(sdata->exchange_dim,nsend,(int*) copylist);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_exchange_kernel_unpack+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackExchange: Kernel execution failed");
}
}
int naccept;
cudaMemcpy((void*)&naccept, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
return naccept;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
timespec atime1,atime2;
clock_gettime(CLOCK_REALTIME,&atime1);
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
clock_gettime(CLOCK_REALTIME,&atime2);
sdata->cuda_timings.test1+=
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
int n_data_items=AtomVecCuda_CountDataItems(data_mask);
int size=nsend*n_data_items*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0];
dy = pbc[1];
dz = pbc[2];
}}
int3 layout=getgrid(nsend);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
Cuda_AtomVecCuda_PackBorder_Kernel<data_mask><<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,nsend,sdata->comm.maxlistlength,iswap,dx,dy,dz);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_border_kernel_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
cudaMemcpy(buf_send, sdata->buffer, size, cudaMemcpyDeviceToHost);
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder: Kernel execution failed");
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_border_download+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
}
return nsend*n_data_items;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int n_data_items=AtomVecCuda_CountDataItems(data_mask);
int size=n*n_data_items*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0];
dy = pbc[1];
dz = pbc[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
Cuda_AtomVecCuda_PackBorder_Self_Kernel<data_mask><<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_border_kernel_self+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_AtomVecCuda_PackBorder_Self: Kernel execution failed");
}
return n*n_data_items;
}
template <const unsigned int data_mask>
int Cuda_AtomVecCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
timespec atime1,atime2;
clock_gettime(CLOCK_REALTIME,&atime1);
if(sdata->atom.update_nmax)
Cuda_AtomVecCuda_UpdateNmax<data_mask>(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
clock_gettime(CLOCK_REALTIME,&atime2);
sdata->cuda_timings.test1+=
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
int n_data_items=AtomVecCuda_CountDataItems(data_mask);
int size=n*n_data_items*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_AtomVecCuda_UpdateBuffer(sdata,size);
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
cudaMemset((int*) (sdata->flag),0,sizeof(int));
cudaMemcpy(sdata->buffer,(void*)buf_recv, size, cudaMemcpyHostToDevice);
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_border_upload+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
Cuda_AtomVecCuda_UnpackBorder_Kernel<data_mask><<<grid, threads,0>>>(n,first);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_border_kernel_unpack+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
cudaMemcpy(&sdata->comm.grow_flag,sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
CUT_CHECK_ERROR("Cuda_AtomVecCuda_UnpackBorder: Kernel execution failed");
}
return sdata->comm.grow_flag;
}
#include "atom_vec_angle_cuda.cu"
#include "atom_vec_atomic_cuda.cu"
#include "atom_vec_charge_cuda.cu"
#include "atom_vec_full_cuda.cu"
//#include "atom_vec_granular_cuda.cu"

View File

@ -1,371 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#define RIMLARGER 1.000001
#define RIMSMALLER 0.999999
#define SMALL 1e-5
extern __shared__ int shared[];
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(j>_nmax) _flag[0]=1;
int k=0;
if(data_mask & X_MASK){
((X_FLOAT*) buffer)[i+k*n]=_x[j] + dx; k++;
((X_FLOAT*) buffer)[i+k*n] = _x[j+_nmax] + dy; k++;
((X_FLOAT*) buffer)[i+k*n] = _x[j+2*_nmax] + dz; k++;}
if(data_mask & V_MASK){
((X_FLOAT*) buffer)[i+k*n]=_v[j]; k++;
((X_FLOAT*) buffer)[i+k*n] = _v[j+_nmax]; k++;
((X_FLOAT*) buffer)[i+k*n] = _v[j+2*_nmax]; k++;}
if(data_mask & OMEGA_MASK){
((X_FLOAT*) buffer)[i+k*n]=_omega[j]; k++;
((X_FLOAT*) buffer)[i+k*n] = _omega[j+_nmax]; k++;
((X_FLOAT*) buffer)[i+k*n] = _omega[j+2*_nmax]; k++;}
if(data_mask & RADIUS_MASK) ((X_FLOAT*) buffer)[i+k*n]=_radius[j]; k++;
if(data_mask & RMASS_MASK) ((X_FLOAT*) buffer)[i+k*n]=_rmass[j]; k++;
}
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(data_mask & X_MASK){
_x[i+first]=_x[j] + dx;
_x[i+first+_nmax] = _x[j+_nmax] + dy;
_x[i+first+2*_nmax] = _x[j+2*_nmax] + dz;}
if(data_mask & V_MASK){
_v[i+first]=_v[j];
_v[i+first+_nmax] = _v[j+_nmax];
_v[i+first+2*_nmax] = _v[j+2*_nmax];}
if(data_mask & OMEGA_MASK) {
_omega[i+first] = _omega[j];
_omega[i+first+_nmax] = _omega[j+_nmax];
_omega[i+first+2*_nmax] = _omega[j+2*_nmax];}
if(data_mask & RADIUS_MASK) _radius[i+first]=_radius[j];
if(data_mask & RMASS_MASK) _rmass[i+first]=_rmass[j];
}
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackComm_Kernel(int n,int first,void* buffer)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
int k=0;
if(data_mask & X_MASK){
_x[i+first]=((X_FLOAT*) buffer)[i+k*n]; k++;
_x[i+first+_nmax]=((X_FLOAT*) buffer)[i+k*n]; k++;
_x[i+first+2*_nmax]=((X_FLOAT*) buffer)[i+k*n]; k++;}
if(data_mask & V_MASK){
_v[i+first]=((X_FLOAT*) buffer)[i+k*n]; k++;
_v[i+first+_nmax]=((X_FLOAT*) buffer)[i+k*n]; k++;
_v[i+first+2*_nmax]=((X_FLOAT*) buffer)[i+k*n]; k++;}
if(data_mask & OMEGA_MASK){
_omega[i+first]=((X_FLOAT*) buffer)[i+k*n]; k++;
_omega[i+first+_nmax]=((X_FLOAT*) buffer)[i+k*n]; k++;
_omega[i+first+2*_nmax]=((X_FLOAT*) buffer)[i+k*n]; k++;}
if(data_mask & RADIUS_MASK) _radius[i+first] = ((X_FLOAT*) buffer)[i+k*n]; k++;
if(data_mask & RMASS_MASK) _rmass[i+first] = ((X_FLOAT*) buffer)[i+k*n]; k++;
}
}
__global__ void Cuda_AtomVecCuda_PackExchangeList_Kernel(int n,int dim)
{
double* buf=(double*) _buffer;
buf=&buf[1];
//X_FLOAT lo=slablo[iswap];
//X_FLOAT hi=slabhi[iswap];
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
bool add=false;
if(i<_nlocal)
{
double xdim_tmp=static_cast <double> (_x[i+dim*_nmax]);
if (xdim_tmp < _sublo[dim] || xdim_tmp >= _subhi[dim])
{
add=true;
}
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
int nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend+1<n)
buf[nsend]=i;
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackExchange_Kernel(int nsend, int* copylist)
{
double* buf=(double*) _buffer;
int k=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(k>=nsend) return;
buf=&buf[1+k];
int i=static_cast <int> (buf[0]);
int j=copylist[k];
int m=1;
if(data_mask & X_MASK){
buf[(m++)*nsend] = static_cast <double> (_x[i]);
buf[(m++)*nsend] = static_cast <double> (_x[i+_nmax]);
buf[(m++)*nsend] = static_cast <double> (_x[i+2*_nmax]);}
if(data_mask & V_MASK){
buf[(m++)*nsend] = _v[i];
buf[(m++)*nsend] = _v[i+_nmax];
buf[(m++)*nsend] = _v[i+2*_nmax];}
if(data_mask & TAG_MASK) buf[(m++)*nsend] = _tag[i];
if(data_mask & TYPE_MASK) buf[(m++)*nsend] = _type[i];
if(data_mask & MASK_MASK) buf[(m++)*nsend] = _mask[i];
if(data_mask & IMAGE_MASK) buf[(m++)*nsend] = _image[i];
if(data_mask & Q_MASK) buf[(m++)*nsend] = _q[i];
if(data_mask & MOLECULE_MASK) buf[(m++)*nsend] = _molecule[i];
if(data_mask & RADIUS_MASK) buf[(m++)*nsend] = _radius[i];
if(data_mask & DENSITY_MASK) buf[(m++)*nsend] = _density[i];
if(data_mask & RMASS_MASK) buf[(m++)*nsend] = _rmass[i];
if(data_mask & OMEGA_MASK) {
buf[(m++)*nsend] = _omega[i];
buf[(m++)*nsend] = _omega[i+_nmax];
buf[(m++)*nsend] = _omega[i+2*_nmax];}
/* if(data_mask & NSPECIAL_MASK)
{
buf[(m++)*nsend] = _nspecial[i];
buf[(m++)*nsend] = _nspecial[i+_nmax];
buf[(m++)*nsend] = _nspecial[i+2* _nmax];
}*/
if(i>=_nlocal) return;
if(data_mask & X_MASK){
_x[i] = _x[j];
_x[i+_nmax] = _x[j+_nmax];
_x[i+2*_nmax] = _x[j+2*_nmax];}
if(data_mask & V_MASK){
_v[i] = _v[j];
_v[i+_nmax] = _v[j+_nmax];
_v[i+2*_nmax] = _v[j+2*_nmax];}
if(data_mask & TAG_MASK) _tag[i] = _tag[j];
if(data_mask & TYPE_MASK) _type[i] = _type[j];
if(data_mask & MASK_MASK) _mask[i] = _mask[j];
if(data_mask & IMAGE_MASK) _image[i] = _image[j];
if(data_mask & Q_MASK) _q[i] = _q[j];
if(data_mask & MOLECULE_MASK) _molecule[i]= _molecule[j];
if(data_mask & RADIUS_MASK) _radius[i] = _radius[j];
if(data_mask & DENSITY_MASK) _density[i] = _density[j];
if(data_mask & RMASS_MASK) _rmass[i] = _rmass[j];
if(data_mask & OMEGA_MASK)
{
_omega[i] = _omega[j];
_omega[i+_nmax] = _omega[j+_nmax];
_omega[i+2*_nmax] = _omega[j+2*_nmax];
}
/* if(data_mask & NSPECIAL_MASK)
{
_nspecial[i] = _nspecial[j];
_nspecial[i+_nmax] = _nspecial[j+_nmax];
_nspecial[i+2* _nmax] = _nspecial[j+2* _nmax];
}*/
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackExchange_Kernel(int dim,int nsend,int* copylist)
{
double* buf=(double*) _buffer;
int k=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(k>=nsend) return;
buf=&buf[1+k];
int i=-1;
double xdim_tmp = buf[(1+dim)*nsend];
if(xdim_tmp>=_sublo[dim]-SMALL && xdim_tmp<_subhi[dim]+SMALL)
{
i=atomicAdd(_flag,1)+_nlocal;
int m=1;
if(data_mask & X_MASK){
_x[i] = buf[(m++)*nsend];
_x[i+_nmax] = buf[(m++)*nsend];
_x[i+2*_nmax] = buf[(m++)*nsend];}
if(data_mask & V_MASK){
_v[i] = buf[(m++)*nsend];
_v[i+_nmax] = buf[(m++)*nsend];
_v[i+2*_nmax] = buf[(m++)*nsend];}
if(data_mask & TAG_MASK) _tag[i] = buf[(m++)*nsend];
if(data_mask & TYPE_MASK) _type[i] = buf[(m++)*nsend];
if(data_mask & MASK_MASK) _mask[i] = buf[(m++)*nsend];
if(data_mask & IMAGE_MASK) _image[i] = buf[(m++)*nsend];
if(data_mask & Q_MASK) _q[i] = buf[(m++)*nsend];
if(data_mask & MOLECULE_MASK) _molecule[i] = buf[(m++)*nsend];
if(data_mask & RADIUS_MASK) _radius[i] = buf[(m++)*nsend];
if(data_mask & DENSITY_MASK) _density[i] = buf[(m++)*nsend];
if(data_mask & RMASS_MASK) _rmass[i] = buf[(m++)*nsend];
if(data_mask & OMEGA_MASK)
{
_omega[i] = buf[(m++)*nsend];
_omega[i+_nmax] = buf[(m++)*nsend];
_omega[i+2*_nmax] = buf[(m++)*nsend];
}
/* if(data_mask & NSPECIAL_MASK)
{
_nspecial[i] = buf[(m++)*nsend];
_nspecial[i+_nmax] = buf[(m++)*nsend];
_nspecial[i+2*_nmax] = buf[(m++)*nsend];
}*/
}
copylist[k]=i;
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackBorder_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
int m=0;
if(data_mask & X_MASK) {
((X_FLOAT*) _buffer)[i+(m++)*n]= _x[j] + dx;
((X_FLOAT*) _buffer)[i+(m++)*n] = _x[j+_nmax] + dy;
((X_FLOAT*) _buffer)[i+(m++)*n] = _x[j+2*_nmax] + dz;}
if(data_mask & V_MASK) {
((X_FLOAT*) _buffer)[i+(m++)*n]= _v[j];
((X_FLOAT*) _buffer)[i+(m++)*n] = _v[j+_nmax];
((X_FLOAT*) _buffer)[i+(m++)*n] = _v[j+2*_nmax];}
if(data_mask & TAG_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _tag[j];
if(data_mask & TYPE_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _type[j];
if(data_mask & MASK_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _mask[j];
if(data_mask & Q_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _q[j];
if(data_mask & MOLECULE_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _molecule[j];
if(data_mask & RADIUS_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _radius[i];
if(data_mask & DENSITY_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _density[i];
if(data_mask & RMASS_MASK) ((X_FLOAT*) _buffer)[i+(m++)*n] = _rmass[i];
if(data_mask & OMEGA_MASK) {
((X_FLOAT*) _buffer)[i+(m++)*n] = _omega[i];
((X_FLOAT*) _buffer)[i+(m++)*n] = _omega[i+_nmax];
((X_FLOAT*) _buffer)[i+(m++)*n] = _omega[i+2*_nmax];}
}
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_PackBorder_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(data_mask & X_MASK) {
_x[i+first]= _x[j] + dx;
_x[i+first+_nmax] = _x[j+_nmax] + dy;
_x[i+first+2*_nmax] = _x[j+2*_nmax] + dz;}
if(data_mask & V_MASK) {
_v[i+first]= _v[j];
_v[i+first+_nmax] = _v[j+_nmax];
_v[i+first+2*_nmax] = _v[j+2*_nmax];}
if(data_mask & TAG_MASK) _tag[i+first] = _tag[j];
if(data_mask & TYPE_MASK) _type[i+first] = _type[j];
if(data_mask & MASK_MASK) _mask[i+first] = _mask[j];
if(data_mask & Q_MASK) _q[i+first] = _q[j];
if(data_mask & MOLECULE_MASK) _molecule[i+first] = _molecule[j];
if(data_mask & RADIUS_MASK) _radius[i+first] = _radius[j];
if(data_mask & DENSITY_MASK) _density[i+first] = _density[j];
if(data_mask & RMASS_MASK) _rmass[i+first] = _rmass[j];
if(data_mask & OMEGA_MASK) {
_omega[i+first]= _omega[j];
_omega[i+first+_nmax] = _omega[j+_nmax];
_omega[i+first+2*_nmax] = _omega[j+2*_nmax];}
}
}
template <const unsigned int data_mask>
__global__ void Cuda_AtomVecCuda_UnpackBorder_Kernel(int n,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
if(i+first<_nmax)
{
int m=0;
if(data_mask & X_MASK) {
_x[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
_x[i+first+_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];
_x[i+first+2*_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];}
if(data_mask & V_MASK) {
_v[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
_v[i+first+_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];
_v[i+first+2*_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];}
if(data_mask & TAG_MASK) _tag[i+first] = static_cast<int> (((X_FLOAT*) _buffer)[i+(m++)*n]);
if(data_mask & TYPE_MASK) _type[i+first] = static_cast<int> (((X_FLOAT*) _buffer)[i+(m++)*n]);
if(data_mask & MASK_MASK) _mask[i+first] = static_cast<int> (((X_FLOAT*) _buffer)[i+(m++)*n]);
if(data_mask & Q_MASK) _q[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
if(data_mask & MOLECULE_MASK) _molecule[i+first] = static_cast<int> (((X_FLOAT*) _buffer)[i+(m++)*n]);
if(data_mask & RADIUS_MASK) _radius[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
if(data_mask & DENSITY_MASK) _density[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
if(data_mask & RMASS_MASK) _rmass[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
if(data_mask & OMEGA_MASK) {
_omega[i+first]=((X_FLOAT*) _buffer)[i+(m++)*n];
_omega[i+first+_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];
_omega[i+first+2*_nmax]=((X_FLOAT*) _buffer)[i+(m++)*n];}
}
else
{
_flag[0]=1;
}
}
}
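The exchange-list kernel above (Cuda_AtomVecCuda_PackExchangeList_Kernel) is worth unpacking: each thread flags whether its atom leaves the subdomain, thread 0 turns the flags into 1-based ranks within the block, and a single atomicAdd reserves the block's slice of the output buffer. A stripped-down sketch of that stream-compaction pattern (a standalone illustration; compact, out, and count are invented names):

  __global__ void compact(const double* x, int n, double lo, double hi, int* out, int* count)
  {
    extern __shared__ int flags[];                    // launch with (blockDim.x+1)*sizeof(int)
    int i = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x + threadIdx.x;
    bool add = (i < n) && (x[i] < lo || x[i] >= hi);  // the "leaves the subdomain" predicate
    flags[threadIdx.x] = add ? 1 : 0;
    __syncthreads();
    if (threadIdx.x == 0) {                           // serial in-block prefix count
      int nsend = 0;
      for (int k = 0; k < blockDim.x; k++)
        if (flags[k]) flags[k] = ++nsend;             // 1-based rank within the block
      flags[blockDim.x] = atomicAdd(count, nsend);    // block's base offset in the output
    }
    __syncthreads();
    if (add) out[flags[blockDim.x] + flags[threadIdx.x] - 1] = i;
  }

The serial loop in thread 0 costs O(blockDim.x), but it avoids one atomicAdd per leaving atom, which is the same trade-off the original kernel makes.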

View File

@ -1,85 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
const unsigned int FULL_DATA_MASK=X_MASK|V_MASK|F_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK;
#include "atom_vec_full_cuda_cu.h"
void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata)
{
return Cuda_AtomVecCuda_Init<FULL_DATA_MASK>(sdata);
}
int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackExchangeList<data_mask>(sdata,n,dim,buf_send);
}
int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|IMAGE_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_UnpackExchange<data_mask>(sdata,nsend,buf_send,copylist);
}
int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder<data_mask>(sdata,nsend,iswap,buf_send,pbc,pbc_flag);
}
int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_PackBorder_Self<data_mask>(sdata,n,iswap,first,pbc,pbc_flag);
}
int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}
int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
const unsigned int data_mask=X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK;
return Cuda_AtomVecCuda_UnpackBorder<data_mask>(sdata,n,first,buf_recv);
}

View File

@@ -1,15 +0,0 @@
#ifndef ATOM_VEC_FULL_CUDA_CU_H_
#define ATOM_VEC_FULL_CUDA_CU_H_
extern "C" void Cuda_AtomVecFullCuda_Init(cuda_shared_data* sdata);
extern "C" int Cuda_AtomVecFullCuda_PackExchangeList(cuda_shared_data* sdata,int n,int dim,void* buf_send);
extern "C" int Cuda_AtomVecFullCuda_PackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecFullCuda_UnpackExchange(cuda_shared_data* sdata,int nsend,void* buf_send,void* copylist);
extern "C" int Cuda_AtomVecFullCuda_PackBorder(cuda_shared_data* sdata,int nsend,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_PackBorderVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_PackBorder_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_PackBorderVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" int Cuda_AtomVecFullCuda_UnpackBorder(cuda_shared_data* sdata,int n,int first,void* buf_recv);
extern "C" int Cuda_AtomVecFullCuda_UnpackBorderVel(cuda_shared_data* sdata,int n,int first,void* buf_recv);
#endif /*ATOM_VEC_FULL_CUDA_CU_H_*/

View File

@@ -1,196 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
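// Host-side drivers for spatial binning on the GPU (compiled only when
// CUDA_USE_BINNING is defined): atoms are sorted into bins, and copied back
// out of them, entirely on the device.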
#ifdef CUDA_USE_BINNING
#include <stdio.h>
#define MY_PREFIX binning
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "binning_cu.h"
#include "binning_kernel.cu"
void Cuda_PreBinning(cuda_shared_data* sdata)
{
// initialize only on first call
static short init = 0;
if(! init)
{
init = 1;
int cuda_dummy_type = sdata->atom.ntypes + 1;
X_FLOAT outside[3] =
{
(sdata->domain.subhi[0] - sdata->domain.sublo[0])/1000.0,
(sdata->domain.subhi[1] - sdata->domain.sublo[1])/1000.0,
(sdata->domain.subhi[2] - sdata->domain.sublo[2])/1000.0
};
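// dummy atoms pad unused bin slots: they carry a type no real atom has and
// coordinates slightly outside the subdomain, so later kernels can treat
// every bin as fully occupied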
cudaMemcpyToSymbol("binned_size_all" , & sdata->atom.binned_type.dim[0] , sizeof(unsigned) );
cudaMemcpyToSymbol("cuda_dummy_type" , & cuda_dummy_type , sizeof(int) );
cudaMemcpyToSymbol("outside" , & outside , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(binned_type), & sdata->atom.binned_type.dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(binned_x) , & sdata->atom.binned_x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi , sizeof(X_FLOAT)*3);
// bin_nmax == blockDim.x
// printf("# CUDA: MY_CONST(binned_type) = %s\n", MY_CONST(binned_type));
// int* p = pre_binning_binned_type; // pre_binning_binned_type is defined here!!
}
dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_dim[2], 1);
dim3 threads(sdata->domain.bin_nmax, 1, 1);
MYDBG(printf("# CUDA: Cuda_PreBinning: pre binning grid = (%u, %u, %u)\n", grid.x, grid.y, grid.z);)
MYDBG(printf("# CUDA: Cuda_PreBinning: pre binning threads = (%u, %u, %u)\n", threads.x, threads.y, threads.z); )
PreBinning_Kernel<<<grid, threads>>> ();
cudaThreadSynchronize();
MYDBG(printf("ERROR-CUDA pre_binning: %s\n",cudaGetErrorString(cudaGetLastError())));
CUT_CHECK_ERROR("Cuda_PreBinning: binning Kernel execution failed");
}
void Cuda_Binning(cuda_shared_data* sdata)
{
MYDBG( // check assumption in debug mode
if(sdata->atom.x.dim[1] != 3)
{
printf("# CUDA: Cuda_Binning: binning error: atom array dimensions not Nx3\n");
return;
}
)
// initialize only on first call
static short init = 0;
if(! init)
{
init = 1;
X_FLOAT const_rez_bin_size[3] =
{
(1.0 * sdata->domain.bin_dim[0]-4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
(1.0 * sdata->domain.bin_dim[1]-4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
(1.0 * sdata->domain.bin_dim[2]-4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
};
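// reciprocal bin widths (bins per unit length); the -4 leaves two layers of
// ghost bins on each side of the subdomain, matching the +2 offset applied
// to the bin coordinates in Binning_Kernel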
cudaMemcpyToSymbol("bin_error_count" , & sdata->atom.bin_error_count.dev_data, sizeof(X_FLOAT)*1);
cudaMemcpyToSymbol("rez_bin_size" , & const_rez_bin_size , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*));
cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*));
cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int3) );
cudaMemcpyToSymbol(MY_CONST(bin_nmax) , & sdata->domain.bin_nmax , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_q) , & sdata->atom.binned_q .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_rmass) , & sdata->atom.binned_rmass .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binpos) , & sdata->atom.binpos .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(natoms) , & sdata->atom.nall , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(nghost) , & sdata->atom.nghost , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , sizeof(X_FLOAT)*3);
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
}
dim3 grid((unsigned)(1 + sdata->atom.nlocal/64.0), 1, 1);
MYDBG( printf("# CUDA: Cuda_Binning: grid dim.x = %u (nlocal: %i)\n", grid.x,sdata->atom.nlocal); )
dim3 threads(64, 1, 1);
cudaMemset((int*) (sdata->atom.bin_count_all.dev_data),0,sizeof(int)*(sdata->domain.bin_dim[0])*(sdata->domain.bin_dim[1])*(sdata->domain.bin_dim[2]));
cudaMemset((int*) (sdata->atom.bin_count_local.dev_data),0,sizeof(int)*(sdata->domain.bin_dim[0])*(sdata->domain.bin_dim[1])*(sdata->domain.bin_dim[2]));
cudaMemset(sdata->atom.bin_error_count.dev_data,0,sizeof(int)*1);
int binning_error_l[1];
Binning_Kernel<<<grid, threads>>> (
(X_FLOAT*) (sdata->atom. x.dev_data),
(X_FLOAT*) (sdata->atom.binned_x.dev_data),
sdata->atom.q_flag,
0,
sdata->atom.rmass_flag
);
cudaThreadSynchronize();
cudaMemcpy((void*) binning_error_l,(void*) sdata->atom.bin_error_count.dev_data,1*sizeof(int),cudaMemcpyDeviceToHost);
if(binning_error_l[0]!=0)
{
printf("CUDA-ERROR: binning local: could not bin %i atoms\n",binning_error_l[0]);
}
CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed");
grid.x=(unsigned)(1 + (sdata->atom.nall-sdata->atom.nlocal)/32.0);
MYDBG( printf("# CUDA: Cuda_Binning Ghost: grid dim.x = %u\n", grid.x); )
Binning_Kernel<<<grid, threads>>> (
(X_FLOAT*) (sdata->atom. x.dev_data),
(X_FLOAT*) (sdata->atom.binned_x.dev_data),
sdata->atom.q_flag,
sdata->atom.nlocal,
sdata->atom.rmass_flag
);
cudaThreadSynchronize();
cudaMemcpy((void*) binning_error_l,(void*) sdata->atom.bin_error_count.dev_data,1*sizeof(int),cudaMemcpyDeviceToHost);
if(binning_error_l[0]!=0) printf("CUDA-ERROR: binning ghost: could not bin %i atoms\n",binning_error_l[0]);
}
void Cuda_ReverseBinning(cuda_shared_data* sdata)
{
// initialize only on first call
static short init = 0;
if(! init)
{
init = 1;
cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*));
cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*));
cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int3) );
cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_q) , & sdata->atom.binned_q .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_tag) , & sdata->atom.binned_tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(natoms) , & sdata->atom.nall , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(q) , & sdata->atom.q .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
}
dim3 grid((unsigned)(1 + sdata->atom.nlocal/32.0), 1, 1);
MYDBG( printf("# CUDA: Cuda_ReverseBinning: grid dim.x = %u (nlocal: %i)\n", grid.x,sdata->atom.nlocal); )
dim3 threads(32, 1, 1);
ReverseBinning_Kernel<<<grid, threads>>> (
(X_FLOAT*) (sdata->atom. x.dev_data),
(X_FLOAT*) (sdata->atom.binned_x.dev_data),
sdata->atom.q_flag
);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Binning: reverse binning Kernel execution failed");
}
#endif

View File

@@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_PreBinning(cuda_shared_data* sdata);
extern "C" void Cuda_Binning(cuda_shared_data* sdata);
extern "C" void Cuda_ReverseBinning(cuda_shared_data* sdata);

View File

@@ -1,149 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// load some variables from shared cuda data into device's constant memory:
__device__ __constant__ X_FLOAT rez_bin_size[3];
__device__ __constant__ unsigned* bin_error_count;
__device__ __constant__ int cuda_dummy_type;
__device__ __constant__ unsigned binned_size_all;
__device__ __constant__ X_FLOAT outside[3];
__global__ void PreBinning_Kernel()
{
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
if(bin < gridDim.x * gridDim.y) // TODO: suspected always to be true
{
_binned_type[blockDim.x * bin + threadIdx.x] = cuda_dummy_type;
const int i = 3*blockDim.x * bin + threadIdx.x;
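// per-bin coordinate layout: x, y and z each occupy a blockDim.x-wide block
// (blockDim.x == bin_nmax), so component c of slot s lives at
// 3*bin_nmax*bin + c*bin_nmax + s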
X_FLOAT* binned_x = _binned_x + i; *binned_x = _subhi[0] + outside[0] * (1+i);
binned_x += blockDim.x; *binned_x = _subhi[1] + outside[1] * (1+i);
binned_x += blockDim.x; *binned_x = _subhi[2] + outside[2] * (1+i);
_binned_tag[blockDim.x * bin + threadIdx.x]=-1; // tag uses the scalar per-bin layout (bin_nmax*bin + slot), not the 3-component coordinate index
}
}
__global__ void Binning_Kernel(X_FLOAT* x, X_FLOAT* binned_x, int q_flag, int offset, int rmass_flag)
{
const unsigned i = blockDim.x * blockIdx.x + threadIdx.x+offset;
int binatoms=_natoms;
if(offset==0) binatoms=_nlocal ;
if(i < binatoms)
{
// copy atom position from global device memory to local register
// in this 3 steps to get as much coalesced access as possible
X_FLOAT my_xX, my_xY, my_xZ;
x += i; my_xX = *x;
x += _nmax; my_xY = *x;
x += _nmax; my_xZ = *x;
//my_xX=x[i];
//my_xY=x[i+_nmax];
//my_xZ=x[i+2*_nmax];
// calculate flat bin index
int bx=__float2int_rd(rez_bin_size[0] * (my_xX - _sublo[0]))+2;
int by=__float2int_rd(rez_bin_size[1] * (my_xY - _sublo[1]))+2;
int bz=__float2int_rd(rez_bin_size[2] * (my_xZ - _sublo[2]))+2;
bx-=bx*negativCUDA(1.0f*bx);
bx-=(bx-_bin_dim.x+1)*negativCUDA(1.0f*_bin_dim.x-1.0f-1.0f*bx);
by-=by*negativCUDA(1.0f*by);
by-=(by-_bin_dim.y+1)*negativCUDA(1.0f*_bin_dim.y-1.0f-1.0f*by);
bz-=bz*negativCUDA(1.0f*bz);
bz-=(bz-_bin_dim.z+1)*negativCUDA(1.0f*_bin_dim.z-1.0f-1.0f*bz);
const unsigned j = _bin_dim.z * ( _bin_dim.y *bx+by)+bz;
// add new atom to bin, get bin-array position
const unsigned k = atomicAdd(& _bin_count_all[j], 1);
if(offset==0) atomicAdd(& _bin_count_local[j], 1);
if(k < _bin_nmax)
{
// copy register values back to global device memory
unsigned pos = 3*_bin_nmax * j + k;
_binpos[i]=pos;
binned_x += pos; *binned_x = my_xX;
binned_x += _bin_nmax; *binned_x = my_xY;
binned_x += _bin_nmax; *binned_x = my_xZ;
// also copy velocity and force accordingly
binned_x = _binned_v + pos; x = _v + i; *binned_x = *x;
binned_x += _bin_nmax; x += _nmax; *binned_x = *x;
binned_x += _bin_nmax; x += _nmax; *binned_x = *x;
binned_x = _binned_f + pos; x = _f + i; *binned_x = *x;
binned_x += _bin_nmax; x += _nmax; *binned_x = *x;
binned_x += _bin_nmax; x += _nmax; *binned_x = *x;
pos = _bin_nmax * j + k;
_binned_type [pos] = _type[i];
_binned_tag [pos] = _tag[i];
if(rmass_flag)
_binned_rmass[pos] = _rmass[i];
if(q_flag)
_binned_q [pos] = _q[i];
}
else
{ // normally, this should not happen:
atomicAdd(bin_error_count, 1);
MYEMUDBG( printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j); )
}
}
}
__global__ void ReverseBinning_Kernel(X_FLOAT* x, X_FLOAT* binned_x,int q_flag)
{
const unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < _nlocal)
{
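// _binpos holds the coordinate index bin_pos3 = 3*_bin_nmax*j + k for bin j,
// slot k; the arithmetic below recovers the scalar index _bin_nmax*j + k
// used by the type/tag/q arrays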
unsigned bin_pos3 = _binpos[i];
unsigned bin_pos=bin_pos3/(3*_bin_nmax);
bin_pos*=_bin_nmax;
bin_pos+=bin_pos3-bin_pos*3;
binned_x = _binned_x + bin_pos3; x = x + i; *x = *binned_x;
binned_x += _bin_nmax; x += _nmax; *x = *binned_x;
binned_x += _bin_nmax; x += _nmax; *x = *binned_x;
binned_x = _binned_v + bin_pos3; x = _v + i; *x = *binned_x;
binned_x += _bin_nmax; x += _nmax; *x = *binned_x;
binned_x += _bin_nmax; x += _nmax; *x = *binned_x;
binned_x = _binned_f + bin_pos3; x = _f + i; *x = *binned_x;
binned_x += _bin_nmax; x += _nmax; *x = *binned_x;
binned_x += _bin_nmax; x += _nmax; *x = *binned_x;
_type[i] = _binned_type[bin_pos];
_tag[i] = _binned_tag[bin_pos];
if(q_flag) _q[i] = _binned_q[bin_pos];
}
}

View File

@@ -1,485 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX comm_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "comm_cuda_cu.h"
#include "comm_cuda_kernel.cu"
#include <ctime>
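// Host drivers for forward/reverse halo communication; the clock_gettime
// pairs below accumulate per-phase timings in sdata->cuda_timings.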
void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata,int n)
{
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbolAsync(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbolAsync(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbolAsync(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbolAsync(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbolAsync(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbolAsync(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
}
void Cuda_CommCuda_Init(cuda_shared_data* sdata)
{
Cuda_CommCuda_UpdateNmax(sdata);
int ntypesp=sdata->atom.ntypes+1;
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , &ntypesp, sizeof(int));
cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd, 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(debugdata) , &sdata->debugdata, sizeof(int*));
}
int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,n);
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
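// dx/dy/dz shift atoms that are sent across periodic boundaries; triclinic
// boxes add the tilt-factor (xy, xz, yz) contributions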
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
cudaMemset( sdata->flag,0,sizeof(int));
clock_gettime(CLOCK_REALTIME,&time1);
void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer;
Cuda_CommCuda_PackComm_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n
,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_kernel_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
if(not sdata->overlap_comm)
cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_forward_download+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
if(aflag!=0) printf("aflag PackComm: %i\n",aflag);
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
}
return 3*n;
}
int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*6*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,2*n); // UpdateBuffer sizes for 3 X_FLOATs per atom; positions plus velocities need 6
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
cudaMemset( sdata->flag,0,sizeof(int));
clock_gettime(CLOCK_REALTIME,&time1);
void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer;
Cuda_CommCuda_PackCommVel_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n
,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_kernel_pack+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
if(not sdata->overlap_comm)
cudaMemcpy(buf_send, sdata->buffer, n*6*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
//cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_forward_download+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
if(aflag!=0) printf("aflag PackCommVel: %i\n",aflag);
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
}
return 6*n;
}
int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,n);
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
clock_gettime(CLOCK_REALTIME,&time1);
Cuda_CommCuda_PackComm_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_kernel_self+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
}
return 3*n;
}
int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*6*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,2*n); // UpdateBuffer sizes for 3 X_FLOATs per atom; positions plus velocities need 6
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
clock_gettime(CLOCK_REALTIME,&time1);
Cuda_CommCuda_PackCommVel_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_kernel_self+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
}
return 6*n;
}
void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap)
{
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,n);
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
clock_gettime(CLOCK_REALTIME,&time1);
if(not sdata->overlap_comm||iswap<0)
cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_upload+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer;
Cuda_CommCuda_UnpackComm_Kernel<<<grid, threads,0>>>(n,first,buf);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_forward_kernel_unpack+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
}
}
void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap)
{
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*6*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,2*n); // UpdateBuffer sizes for 3 X_FLOATs per atom; positions plus velocities need 6
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
clock_gettime(CLOCK_REALTIME,&time1);
if(not sdata->overlap_comm||iswap<0)
cudaMemcpy(sdata->buffer,(void*)buf_recv, n*6*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_forward_upload+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer;
Cuda_CommCuda_UnpackCommVel_Kernel<<<grid, threads,0>>>(n,first,buf);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time1);
sdata->cuda_timings.comm_forward_kernel_unpack+=
time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
}
}
int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send)
{
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(F_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,n);
F_FLOAT* buf=(F_FLOAT*)buf_send;
F_FLOAT* f_dev=(F_FLOAT*)sdata->atom.f.dev_data;
f_dev+=first;
cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
buf+=n; f_dev+=sdata->atom.nmax;
cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
buf+=n; f_dev+=sdata->atom.nmax;
cudaMemcpy(buf, f_dev, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
return n*3;
}
void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv)
{
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(F_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,n);
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
cudaMemcpy(sdata->buffer,buf_recv, size, cudaMemcpyHostToDevice);
Cuda_CommCuda_UnpackReverse_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed");
}
}
void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first)
{
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,n);
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
Cuda_CommCuda_UnpackReverse_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_CommCuda_PackReverse_Self: Kernel execution failed");
}
}
int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap)
{
MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");)
timespec time1,time2;
if(sdata->atom.update_nmax)
Cuda_CommCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new or (80>sdata->buffersize))
Cuda_CommCuda_UpdateBuffer(sdata,10);
int n;
if (!bordergroup || ineed >= 2)
n=nlast-nfirst+1;
else
{
n=atom_nfirst;
if(nlast-sdata->atom.nlocal+1>n) n=nlast-sdata->atom.nlocal+1;
}
int3 layout=getgrid(n,0,512,true);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x+1, layout.y, 1);
cudaMemset((int*) (sdata->buffer),0,sizeof(int));
clock_gettime(CLOCK_REALTIME,&time1);
if(style==1)
Cuda_CommCuda_BuildSendlist_Single<<<grid, threads,(threads.x+1)*sizeof(int)>>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.slablo.dev_data,(X_FLOAT*) sdata->comm.slabhi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength);
else
Cuda_CommCuda_BuildSendlist_Multi<<<grid, threads,(threads.x+1)*sizeof(int)>>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.multilo.dev_data,(X_FLOAT*) sdata->comm.multihi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&time2);
sdata->cuda_timings.comm_border_kernel_buildlist+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed");
int nsend;
cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
return nsend;
}

View File

@@ -1,35 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbcflag);
extern "C" int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbcflag);
extern "C" int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbcflag);
extern "C" int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbcflag);
extern "C" void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap=-1);
extern "C" void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap=-1);
extern "C" int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send);
extern "C" void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv);
extern "C" void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first);
extern "C" int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap);

View File

@@ -1,353 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
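// Device kernels for forward (positions, optionally velocities) and reverse
// (forces) communication. Buffers are component-major: n x-values, then n
// y-values, then n z-values; velocities follow in the same layout.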
__global__ void Cuda_CommCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(j>=_nmax) _flag[0]=1;
((X_FLOAT*) buffer)[i]=_x[j] + dx;
((X_FLOAT*) buffer)[i+1*n] = _x[j+_nmax] + dy;
((X_FLOAT*) buffer)[i+2*n] = _x[j+2*_nmax] + dz;
}
}
__global__ void Cuda_CommCuda_PackCommVel_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,void* buffer)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(j>=_nmax) _flag[0]=1;
((X_FLOAT*) buffer)[i]=_x[j] + dx;
((X_FLOAT*) buffer)[i+1*n] = _x[j+_nmax] + dy;
((X_FLOAT*) buffer)[i+2*n] = _x[j+2*_nmax] + dz;
((X_FLOAT*) buffer)[i+3*n]=_v[j];
((X_FLOAT*) buffer)[i+4*n] = _v[j+_nmax];
((X_FLOAT*) buffer)[i+5*n] = _v[j+2*_nmax];
}
}
__global__ void Cuda_CommCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
_x[i+first]=_x[j] + dx;
_x[i+first+_nmax] = _x[j+_nmax] + dy;
_x[i+first+2*_nmax] = _x[j+2*_nmax] + dz;
}
}
__global__ void Cuda_CommCuda_PackCommVel_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
_x[i+first]=_x[j] + dx;
_x[i+first+_nmax] = _x[j+_nmax] + dy;
_x[i+first+2*_nmax] = _x[j+2*_nmax] + dz;
_v[i+first]=_v[j];
_v[i+first+_nmax] = _v[j+_nmax];
_v[i+first+2*_nmax] = _v[j+2*_nmax];
}
}
__global__ void Cuda_CommCuda_UnpackComm_Kernel(int n,int first,void* buffer)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
_x[i+first]=((X_FLOAT*) buffer)[i];
_x[i+first+_nmax]=((X_FLOAT*) buffer)[i+1*n];
_x[i+first+2*_nmax]=((X_FLOAT*) buffer)[i+2*n];
}
}
__global__ void Cuda_CommCuda_UnpackCommVel_Kernel(int n,int first,void* buffer)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
_x[i+first]=((X_FLOAT*) buffer)[i];
_x[i+first+_nmax]=((X_FLOAT*) buffer)[i+1*n];
_x[i+first+2*_nmax]=((X_FLOAT*) buffer)[i+2*n];
_v[i+first]=((X_FLOAT*) buffer)[i+3*n];
_v[i+first+_nmax]=((X_FLOAT*) buffer)[i+4*n];
_v[i+first+2*_nmax]=((X_FLOAT*) buffer)[i+5*n];
}
}
__global__ void Cuda_CommCuda_PackReverse_Kernel(int n,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
((F_FLOAT*) _buffer)[i]=_f[i+first];
((F_FLOAT*) _buffer)[i+n] = _f[i+first+_nmax];
((F_FLOAT*) _buffer)[i+2*n] = _f[i+first+2*_nmax];
}
}
__global__ void Cuda_CommCuda_UnpackReverse_Kernel(int* sendlist,int n,int maxlistlength,int iswap)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
_f[j]+=((F_FLOAT*)_buffer)[i];
_f[j+_nmax]+=((F_FLOAT*) _buffer)[i+n];
_f[j+2*_nmax]+=((F_FLOAT*) _buffer)[i+2*n];
}
}
__global__ void Cuda_CommCuda_UnpackReverse_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
_f[j]+=_f[i+first];
_f[j+_nmax]+=_f[i+first+_nmax];
_f[j+2*_nmax]+=_f[i+first+2*_nmax];
}
}
extern __shared__ int shared[];
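// Sendlist construction: every thread flags its atom in shared memory,
// thread 0 serially turns the flags into 1-based ranks and reserves a
// contiguous segment of the global list via atomicAdd on _buffer (which
// holds the running send count), then each flagged thread writes its atom
// index into its reserved slot.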
__global__ void Cuda_CommCuda_BuildSendlist_Single(int bordergroup,int ineed,int atom_nfirst,
int nfirst,int nlast,int dim,int iswap,X_FLOAT* slablo, X_FLOAT* slabhi,int* sendlist,int maxlistlength)
{
int* list=sendlist+iswap*maxlistlength;
X_FLOAT lo=slablo[iswap];
X_FLOAT hi=slabhi[iswap];
bool add=false;
if (!bordergroup || ineed >= 2) {
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x+nfirst;
if(i<nlast)
if (_x[i+dim*_nmax] >= lo && _x[i+dim*_nmax] <= hi) {
add=true;
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
int nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend<maxlistlength)
list[nsend] = i;
} else {
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<atom_nfirst)
if (_x[i+dim*_nmax] >= lo && _x[i+dim*_nmax] <= hi) {
add=true;
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
int nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend<maxlistlength)
list[nsend] = i;
__syncthreads();
add=false;
i+=_nlocal;
if(i < nlast)
if (_x[i+dim*_nmax] >= lo && _x[i+dim*_nmax] <= hi) {
add=true;
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend<maxlistlength)
list[nsend] = i;
}
}
__global__ void Cuda_CommCuda_BuildSendlist_Multi(int bordergroup,int ineed,int atom_nfirst
,int nfirst,int nlast,int dim,int iswap,X_FLOAT* multilo, X_FLOAT* multihi,int* sendlist,int maxlistlength)
{
int* list=sendlist+iswap*maxlistlength;
X_FLOAT* mlo=&multilo[iswap*_cuda_ntypes];
X_FLOAT* mhi=&multihi[iswap*_cuda_ntypes];
int itype=0;
bool add=false;
if (!bordergroup || ineed >= 2) {
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x+nfirst;
if(i<nlast)
{
itype=_type[i];
if (_x[i+dim*_nmax] >= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) {
add=true;
}
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
int nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend<maxlistlength)
list[nsend] = i;
} else {
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<atom_nfirst)
{
itype=_type[i];
if (_x[i+dim*_nmax] >= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) {
add=true;
}
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
int nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend<maxlistlength)
list[nsend] = i;
__syncthreads();
add=false;
i+=_nlocal;
if(i < nlast)
{
itype = _type[i];
if (_x[i+dim*_nmax] >= mlo[itype] && _x[i+dim*_nmax] <= mhi[itype]) {
add=true;
}
}
shared[threadIdx.x]=add?1:0;
__syncthreads();
nsend=0;
if(threadIdx.x==0)
{
for(int k=0;k<blockDim.x;k++)
{
if(shared[k]) {nsend++; shared[k]=nsend;}
}
shared[blockDim.x]=atomicAdd((int*) _buffer,nsend);
}
__syncthreads();
nsend=shared[blockDim.x]+shared[threadIdx.x]-1;
if(add&&nsend<maxlistlength)
list[nsend] = i;
}
}

View File

@@ -1,123 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX compute_temp_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "compute_temp_cuda_cu.h"
#include "compute_temp_cuda_kernel.cu"
void Cuda_ComputeTempCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size=(unsigned)((sdata->atom.nlocal+63)/64.0)*6*sizeof(ENERGY_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_ComputeTempCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
void Cuda_ComputeTempCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) );
if(sdata->atom.rmass_flag)
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
}
void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata)
{
Cuda_ComputeTempCuda_UpdateNmax(sdata);
}
void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
//if(sdata->buffer_new)
Cuda_ComputeTempCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
Cuda_ComputeTempCuda_Vector_Kernel<<<grid, threads,threads.x*6*sizeof(ENERGY_FLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: compute_vector Kernel execution failed");
int oldgrid=grid.x*grid.y;
grid.x=6;
grid.y=1;
threads.x=512;
Cuda_ComputeTempCuda_Reduce_Kernel<<<grid, threads,threads.x*sizeof(ENERGY_FLOAT)>>> (oldgrid,t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Vector: reduce_vector Kernel execution failed");
}
}
void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
//if(sdata->buffer_new)
Cuda_ComputeTempCuda_UpdateBuffer(sdata);
MYDBG(printf("#CUDA ComputeTempCuda_Scalar: %i\n",sdata->atom.nlocal);)
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: pre compute_scalar Kernel");
Cuda_ComputeTempCuda_Scalar_Kernel<<<grid, threads,threads.x*sizeof(ENERGY_FLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: compute_scalar Kernel execution failed");
int oldgrid=grid.x*grid.y;
grid.x=1;
grid.y=1;
threads.x=512;
Cuda_ComputeTempCuda_Reduce_Kernel<<<grid, threads,threads.x*sizeof(ENERGY_FLOAT)>>> (oldgrid,t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempCuda_Scalar: reduce_scalar Kernel execution failed");
}
}

View File

@@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_ComputeTempCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_ComputeTempCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t);
extern "C" void Cuda_ComputeTempCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t);

View File

@@ -1,109 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_FLOAT sharedmem[];
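// Temperature computation: per-block partial sums of m*v*v terms are written
// to _buffer and combined by Cuda_ComputeTempCuda_Reduce_Kernel.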
__global__ void Cuda_ComputeTempCuda_Scalar_Kernel(int groupbit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
if(i < _nlocal)
{
if (_rmass_flag) {
if (_mask[i] & groupbit)
sharedmem[threadIdx.x] = (_v[i]*_v[i] + _v[i+_nmax]*_v[i+_nmax] + _v[i+2*_nmax]*_v[i+2*_nmax]) * _rmass[i];
} else {
if (_mask[i] & groupbit)
sharedmem[threadIdx.x] = (_v[i]*_v[i] + _v[i+_nmax]*_v[i+_nmax] + _v[i+2*_nmax]*_v[i+2*_nmax]) * (_mass[_type[i]]);
}
}
reduceBlock(sharedmem);
ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer;
if(threadIdx.x==0)
{
buffer[(blockIdx.x*gridDim.y+blockIdx.y)]=sharedmem[0];
}
}
__global__ void Cuda_ComputeTempCuda_Vector_Kernel(int groupbit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
sharedmem[threadIdx.x+blockDim.x]=0;
sharedmem[threadIdx.x+2*blockDim.x]=0;
sharedmem[threadIdx.x+3*blockDim.x]=0;
sharedmem[threadIdx.x+4*blockDim.x]=0;
sharedmem[threadIdx.x+5*blockDim.x]=0;
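// six accumulators: the kinetic tensor components xx, yy, zz, xy, xz, yz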
if(i < _nlocal)
if (_mask[i] & groupbit) {
V_FLOAT massone;
if (_rmass_flag) massone = _rmass[i];
else massone = _mass[_type[i]];
sharedmem[threadIdx.x] = massone * _v[i]*_v[i];
sharedmem[threadIdx.x+blockDim.x] = massone * _v[i+_nmax]*_v[i+_nmax];
sharedmem[threadIdx.x+2*blockDim.x] = massone * _v[i+2*_nmax]*_v[i+2*_nmax];
sharedmem[threadIdx.x+3*blockDim.x] = massone * _v[i]*_v[i+_nmax];
sharedmem[threadIdx.x+4*blockDim.x] = massone * _v[i]*_v[i+2*_nmax];
sharedmem[threadIdx.x+5*blockDim.x] = massone * _v[i+_nmax]*_v[i+2*_nmax];
}
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2*blockDim.x]);
reduceBlock(&sharedmem[3*blockDim.x]);
reduceBlock(&sharedmem[4*blockDim.x]);
reduceBlock(&sharedmem[5*blockDim.x]);
ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer;
if(threadIdx.x==0)
{
buffer[(blockIdx.x*gridDim.y+blockIdx.y)]=sharedmem[0];
buffer[(blockIdx.x*gridDim.y+blockIdx.y)+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
buffer[(blockIdx.x*gridDim.y+blockIdx.y)+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
buffer[(blockIdx.x*gridDim.y+blockIdx.y)+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
buffer[(blockIdx.x*gridDim.y+blockIdx.y)+4*gridDim.x*gridDim.y]=sharedmem[4*blockDim.x];
buffer[(blockIdx.x*gridDim.y+blockIdx.y)+5*gridDim.x*gridDim.y]=sharedmem[5*blockDim.x];
}
}
__global__ void Cuda_ComputeTempCuda_Reduce_Kernel(int n,ENERGY_FLOAT* t)
{
int i=0;
sharedmem[threadIdx.x]=0;
ENERGY_FLOAT myforig=0.0;
ENERGY_FLOAT* buf=(ENERGY_FLOAT*) _buffer;
buf=&buf[blockIdx.x*n];
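// each block reduces one n-element array of per-block partial sums,
// striding through it blockDim.x elements at a time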
while(i<n)
{
sharedmem[threadIdx.x]=0;
if(i+threadIdx.x<n)
sharedmem[threadIdx.x]=buf[i+threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i+=blockDim.x;
if(threadIdx.x==0)
myforig+=sharedmem[0];
}
if(threadIdx.x==0)
t[blockIdx.x]=myforig;
}

View File

@@ -1,161 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX compute_temp_partial_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "compute_temp_partial_cuda_cu.h"
#include "compute_temp_partial_cuda_kernel.cu"
void Cuda_ComputeTempPartialCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size=(unsigned)((sdata->atom.nlocal+63)/64.0)*6*sizeof(ENERGY_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_ComputeTempPartialCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
void Cuda_ComputeTempPartialCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) );
if(sdata->atom.rmass_flag)
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
}
void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata)
{
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
}
void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
//if(sdata->buffer_new)
Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
Cuda_ComputeTempPartialCuda_Vector_Kernel<<<grid, threads,threads.x*6*sizeof(ENERGY_FLOAT)>>> (groupbit,xflag,yflag,zflag);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: compute_vector Kernel execution failed");
int oldgrid=grid.x*grid.y;
grid.x=6;
grid.y=1;
threads.x=512;
Cuda_ComputeTempPartialCuda_Reduce_Kernel<<<grid, threads,threads.x*sizeof(ENERGY_FLOAT)>>> (oldgrid,t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Vector: reduce_vector Kernel execution failed");
}
}
void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag)
{
//if(sdata->atom.update_nmax) //is most likely not called every timestep, therefore update of constants is necessary
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
//if(sdata->buffer_new)
Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
MYDBG(printf("#CUDA ComputeTempPartialCuda_Scalar: %i\n",sdata->atom.nlocal);)
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: pre compute_scalar Kernel");
Cuda_ComputeTempPartialCuda_Scalar_Kernel<<<grid, threads,threads.x*sizeof(ENERGY_FLOAT)>>> (groupbit,xflag,yflag,zflag);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: compute_scalar Kernel execution failed");
int oldgrid=grid.x*grid.y;
grid.x=1;
threads.x=512;
Cuda_ComputeTempPartialCuda_Reduce_Kernel<<<grid, threads,threads.x*sizeof(ENERGY_FLOAT)>>> (oldgrid,t);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_Scalar: reduce_scalar Kernel execution failed");
}
}
void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall)
{
//if(sdata->atom.update_nmax) // check disabled: this compute is not necessarily called every timestep, so the constants must always be refreshed
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
//if(sdata->buffer_new)
Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel<<<grid, threads,0>>> (groupbit,xflag,yflag,zflag,(V_FLOAT*) vbiasall);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
}
}
void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall)
{
//if(sdata->atom.update_nmax) // check disabled: this compute is not necessarily called every timestep, so the constants must always be refreshed
Cuda_ComputeTempPartialCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
//if(sdata->buffer_new)
Cuda_ComputeTempPartialCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel<<<grid, threads,0>>> (groupbit,xflag,yflag,zflag,(V_FLOAT*) vbiasall);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_ComputeTempPartialCuda_RemoveBiasAll: compute_vector Kernel execution failed");
}
}

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_ComputeTempPartialCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_ComputeTempPartialCuda_Vector(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag);
extern "C" void Cuda_ComputeTempPartialCuda_Scalar(cuda_shared_data* sdata, int groupbit,ENERGY_FLOAT* t,int xflag,int yflag,int zflag);
extern "C" void Cuda_ComputeTempPartialCuda_RemoveBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall);
extern "C" void Cuda_ComputeTempPartialCuda_RestoreBiasAll(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,void* vbiasall);

View File

@ -1,152 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_FLOAT sharedmem[];
__global__ void Cuda_ComputeTempPartialCuda_Scalar_Kernel(int groupbit,int xflag,int yflag,int zflag)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
if(i < _nlocal)
{
if (_rmass_flag) {
if (_mask[i] & groupbit)
sharedmem[threadIdx.x] = (_v[i]*_v[i]*xflag + _v[i+_nmax]*_v[i+_nmax]*yflag + _v[i+2*_nmax]*_v[i+2*_nmax]*zflag) * _rmass[i];
} else {
if (_mask[i] & groupbit)
sharedmem[threadIdx.x] = (_v[i]*_v[i]*xflag + _v[i+_nmax]*_v[i+_nmax]*yflag + _v[i+2*_nmax]*_v[i+2*_nmax]*zflag) * (_mass[_type[i]]);
}
}
reduceBlock(sharedmem);
ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer;
if(threadIdx.x==0)
{
buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
}
}
__global__ void Cuda_ComputeTempPartialCuda_Vector_Kernel(int groupbit,int xflag,int yflag,int zflag)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
sharedmem[threadIdx.x+blockDim.x]=0;
sharedmem[threadIdx.x+2*blockDim.x]=0;
sharedmem[threadIdx.x+3*blockDim.x]=0;
sharedmem[threadIdx.x+4*blockDim.x]=0;
sharedmem[threadIdx.x+5*blockDim.x]=0;
if(i < _nlocal)
if (_mask[i] & groupbit) {
V_FLOAT massone;
if (_rmass_flag) massone = _rmass[i];
else massone = _mass[_type[i]];
sharedmem[threadIdx.x] = massone * _v[i]*_v[i]*xflag;
sharedmem[threadIdx.x+blockDim.x] = massone * _v[i+_nmax]*_v[i+_nmax]*yflag;
sharedmem[threadIdx.x+2*blockDim.x] = massone * _v[i+2*_nmax]*_v[i+2*_nmax]*zflag;
sharedmem[threadIdx.x+3*blockDim.x] = massone * _v[i]*_v[i+_nmax]*xflag*yflag;
sharedmem[threadIdx.x+4*blockDim.x] = massone * _v[i]*_v[i+2*_nmax]*xflag*zflag;
sharedmem[threadIdx.x+5*blockDim.x] = massone * _v[i+_nmax]*_v[i+2*_nmax]*yflag*zflag;
}
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2*blockDim.x]);
reduceBlock(&sharedmem[3*blockDim.x]);
reduceBlock(&sharedmem[4*blockDim.x]);
reduceBlock(&sharedmem[5*blockDim.x]);
ENERGY_FLOAT* buffer=(ENERGY_FLOAT*) _buffer;
if(threadIdx.x==0)
{
buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+4*gridDim.x*gridDim.y]=sharedmem[4*blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+5*gridDim.x*gridDim.y]=sharedmem[5*blockDim.x];
}
}
__global__ void Cuda_ComputeTempPartialCuda_Reduce_Kernel(int n,ENERGY_FLOAT* t)
{
int i=0;
sharedmem[threadIdx.x]=0;
ENERGY_FLOAT myforig=0.0;
ENERGY_FLOAT* buf=(ENERGY_FLOAT*) _buffer;
buf=&buf[blockIdx.x*n];
while(i<n)
{
sharedmem[threadIdx.x]=0;
if(i+threadIdx.x<n)
sharedmem[threadIdx.x]=buf[i+threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i+=blockDim.x;
if(threadIdx.x==0)
myforig+=sharedmem[0];
}
if(threadIdx.x==0)
t[blockIdx.x]=myforig;
}
__global__ void Cuda_ComputeTempPartialCuda_RemoveBiasAll_Kernel(int groupbit,int xflag,int yflag,int zflag,V_FLOAT* vbiasall)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
if(!xflag)
{
vbiasall[i] = _v[i];
_v[i] = V_F(0.0);
}
if(!yflag)
{
vbiasall[i+_nmax] = _v[i+_nmax];
_v[i+_nmax] = V_F(0.0);
}
if(!zflag)
{
vbiasall[i+2*_nmax] = _v[i+2*_nmax];
_v[i+2*_nmax] = V_F(0.0);
}
}
}
__global__ void Cuda_ComputeTempPartialCuda_RestoreBiasAll_Kernel(int groupbit,int xflag,int yflag,int zflag,V_FLOAT* vbiasall)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
if(!xflag)
{
_v[i] += vbiasall[i];
}
if(!yflag)
{
_v[i+_nmax] += vbiasall[i+_nmax];
}
if(!zflag)
{
_v[i+2*_nmax] += vbiasall[i+2*_nmax];
}
}
}

View File

@ -1,919 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CRM_CUDA_UTILS
#define CRM_CUDA_UTILS
//split n work items into a 2-dimensional grid of blocks; the returned int3 holds the grid dimensions in .x and .y and the threads per block in .z
#define MIN(a,b) ((a) < (b) ? (a) : (b))
#define MAX(a,b) ((a) > (b) ? (a) : (b))
inline int3 getgrid(int n, int shared_per_thread = 0, int threadsmax = 256, bool p2 = false)
{
int3 gridparams;
int sharedsize = 16000;
if(shared_per_thread > 0) threadsmax = sharedsize / shared_per_thread < threadsmax ? sharedsize / shared_per_thread : threadsmax;
if((n < 60 * 32) || (threadsmax < 64))
gridparams.z = 32;
else if((n < 60 * 64) || (threadsmax < 128))
gridparams.z = 64;
else if((n < 60 * 128) || (threadsmax < 256))
gridparams.z = 128;
else if((n < 60 * 256) || (threadsmax < 512))
gridparams.z = 256;
else gridparams.z = 512;
if(p2) {
gridparams.z = 16;
while(gridparams.z * 2 <= threadsmax) gridparams.z *= 2;
}
int blocks = (n + gridparams.z - 1) / gridparams.z;
if(blocks > 10000)
gridparams.x = gridparams.y = int(sqrt(blocks));
else {
gridparams.x = blocks;
gridparams.y = 1;
}
while(gridparams.x * gridparams.y * gridparams.z < n) gridparams.x++;
if(gridparams.x == 0) gridparams.x = 1;
return gridparams;
}
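// Typical use (a sketch of the launch pattern the host wrappers in this
// package follow; n is the number of work items, SomeKernel stands in for
// any of the kernels of this package):
//   int3 layout = getgrid(n);
//   dim3 threads(layout.z, 1, 1);
//   dim3 grid(layout.x, layout.y, 1);
//   SomeKernel<<<grid, threads>>>(...);
// inside the kernel the flat item index is then recovered as
//   int i = (blockIdx.x*gridDim.y + blockIdx.y)*blockDim.x + threadIdx.x;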
//return value: 1 if f<0; else: 0
//take care when passing an expression such as blockIdx.x-n for f: blockIdx.x is unsigned, so the subtraction is evaluated as an unsigned int (cast to int first)
static inline __device__ int negativCUDA(float f)
{
return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31;
}
//return value: -1 if f<0; else +1
static inline __device__ float fsignCUDA(float f)
{
return f < 0.0f ? -1.0f : 1.0f;
}
//functions to copy data between global and shared memory (in fact they can copy between any two memory regions on the device, as long as read and write access is given)
//blockDim.y and blockDim.z are assumed to be 1
static inline __device__ void copySharedToGlob(int* shared, int* glob, const int &n)
{
int i, k;
k = n - blockDim.x;
for(i = 0; i < k; i += blockDim.x) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
if(threadIdx.x < n - i) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
__syncthreads();
}
static inline __device__ void copySharedToGlob(float* shared, float* glob, const int &n)
{
int i, k;
k = n - blockDim.x;
for(i = 0; i < k; i += blockDim.x) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
if(threadIdx.x < n - i) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
__syncthreads();
}
static inline __device__ void copySharedToGlob(double* shared, double* glob, const int &n)
{
int i, k;
k = n - blockDim.x;
for(i = 0; i < k; i += blockDim.x) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
if(threadIdx.x < n - i) {
glob[i + threadIdx.x] = shared[i + threadIdx.x];
}
__syncthreads();
}
static inline __device__ void copyGlobToShared(int* glob, int* shared, const int &n)
{
int i, k;
k = n - blockDim.x;
for(i = 0; i < k; i += blockDim.x) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
if(threadIdx.x < n - i) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
__syncthreads();
}
static __device__ inline void copyGlobToShared(float* glob, float* shared, const int &n)
{
int i, k;
k = n - blockDim.x;
for(i = 0; i < k; i += blockDim.x) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
if(threadIdx.x < n - i) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
__syncthreads();
}
static __device__ inline void copyGlobToShared(double* glob, double* shared, const int &n)
{
int i;
for(i = 0; i < n - blockDim.x; i += blockDim.x) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
if(threadIdx.x < n - i) {
shared[i + threadIdx.x] = glob[i + threadIdx.x];
}
__syncthreads();
}
//copy data between two memory areas on device, 3d BlockDims are allowed
static __device__ inline void copyData(double* source, double* target, const int &n)
{
int i;
int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
target[i + offset] = source[i + offset];
}
if(offset < n - i) {
target[i + offset] = source[i + offset];
}
__syncthreads();
}
static __device__ inline void copyData(float* source, float* target, const int &n)
{
int i;
int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
target[i + offset] = source[i + offset];
}
if(offset < n - i) {
target[i + offset] = source[i + offset];
}
__syncthreads();
}
static __device__ inline void copyData(int* source, int* target, const int &n)
{
int i;
int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
target[i + offset] = source[i + offset];
}
if(offset < n - i) {
target[i + offset] = source[i + offset];
}
__syncthreads();
}
static __device__ inline void copyData(unsigned int* source, unsigned int* target, const int &n)
{
int i;
int offset = threadIdx.x * blockDim.y * blockDim.z + threadIdx.y * blockDim.z + threadIdx.z;
for(i = 0; i < n - blockDim.x * blockDim.y * blockDim.z; i += blockDim.x * blockDim.y * blockDim.z) {
target[i + offset] = source[i + offset];
}
if(offset < n - i) {
target[i + offset] = source[i + offset];
}
__syncthreads();
}
//functions to sum the values of one block. The P2 variants require blockDim.x to be a power of 2, otherwise the behaviour is not well defined
//afterwards data[0] holds the sum of data[i] for i = 0 .. blockDim.x-1
//reduceBlockP2 and reduceBlock assume blockDim.y == 1 and blockDim.z == 1
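// Minimal usage sketch (the pattern the compute kernels in this package
// follow): each thread deposits one value in dynamic shared memory, the
// block is reduced, and thread 0 writes the block's sum out:
//   sharedmem[threadIdx.x] = myvalue;            // myvalue: per-thread contribution
//   reduceBlock(sharedmem);
//   if(threadIdx.x == 0) buffer[blockIdx.x*gridDim.y+blockIdx.y] = sharedmem[0];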
static __device__ inline void reduceBlockP2(int* data)
{
__syncthreads();
for(int i = 2; i <= blockDim.x; i *= 2) {
if(threadIdx.x < blockDim.x / i)
data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];
__syncthreads();
}
}
static __device__ inline void reduceBlockP2(unsigned int* data)
{
__syncthreads();
for(int i = 2; i <= blockDim.x; i *= 2) {
if(threadIdx.x < blockDim.x / i)
data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];
__syncthreads();
}
}
static __device__ inline void reduceBlockP2(float* data)
{
__syncthreads();
for(int i = 2; i <= blockDim.x; i *= 2) {
if(threadIdx.x < blockDim.x / i)
data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];
__syncthreads();
}
}
static __device__ inline void reduceBlockP2(double* data)
{
__syncthreads();
for(int i = 2; i <= blockDim.x; i *= 2) {
if(threadIdx.x < blockDim.x / i)
data[threadIdx.x] += data[threadIdx.x + blockDim.x / i];
__syncthreads();
}
}
static __device__ inline void reduceBlock(float* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] += data[threadIdx.x + p2];
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] += data[threadIdx.x + p2 / i];
__syncthreads();
}
}
static __device__ inline void reduceBlock(int* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] += data[threadIdx.x + p2];
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] += data[threadIdx.x + p2 / i];
__syncthreads();
}
}
static __device__ inline void reduceBlock(unsigned int* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] += data[threadIdx.x + p2];
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] += data[threadIdx.x + p2 / i];
__syncthreads();
}
}
static __device__ inline void reduceBlock(double* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] += data[threadIdx.x + p2];
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] += data[threadIdx.x + p2 / i];
__syncthreads();
}
}
static __device__ inline void cudaFillBlockData_int(int* data, const int &n, const int &value)
{
int i;
for(i = 0; i < n - blockDim.x; i += blockDim.x) {
data[i + threadIdx.x] = value;
}
if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}
static __device__ inline void cudaFillBlockData_float(float* data, const int &n, const float &value)
{
int i;
for(i = 0; i < n - blockDim.x; i += blockDim.x) {
data[i + threadIdx.x] = value;
}
if(threadIdx.x < n - i) data[i + threadIdx.x] = value;
}
static __device__ inline void reduce(float* data, int n) // caution: not verified to work
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < n) p2 *= 2;
int j = 0;
while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
j++;
}
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
j++;
}
__syncthreads();
}
}
static __device__ inline void reduce(double* data, int n) // caution: not verified to work
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < n) p2 *= 2;
int j = 0;
while((threadIdx.x + blockDim.x * j) * 2 < n - p2) {
data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2];
j++;
}
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] += data[(threadIdx.x + blockDim.x * j) + p2 / i];
j++;
}
__syncthreads();
}
}
static __device__ inline void minOfBlock(float* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);
__syncthreads();
}
}
static __device__ inline void maxOfBlock(float* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);
__syncthreads();
}
}
static __device__ inline void minOfBlock(double* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] = MIN(data[threadIdx.x + p2], data[threadIdx.x]);
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] = MIN(data[threadIdx.x + p2 / i], data[threadIdx.x]);
__syncthreads();
}
}
static __device__ inline void maxOfBlock(double* data)
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < blockDim.x) p2 *= 2;
if(threadIdx.x < blockDim.x - p2)
data[threadIdx.x] = MAX(data[threadIdx.x + p2], data[threadIdx.x]);
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
if(threadIdx.x < p2 / i)
data[threadIdx.x] = MAX(data[threadIdx.x + p2 / i], data[threadIdx.x]);
__syncthreads();
}
}
static __device__ inline void minOfData(double* data, int n) // caution: not verified to work
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < n) p2 *= 2;
int j = 0;
while((threadIdx.x + blockDim.x * j) < n - p2) {
data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
j++;
}
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
j++;
}
__syncthreads();
}
}
static __device__ inline void maxOfData(double* data, int n) // caution: not verified to work
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < n) p2 *= 2;
int j = 0;
while((threadIdx.x + blockDim.x * j) < n - p2) {
data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
j++;
}
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
j++;
}
__syncthreads();
}
}
static __device__ inline void minOfData(float* data, int n) // caution: not verified to work
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < n) p2 *= 2;
int j = 0;
while((threadIdx.x + blockDim.x * j) < n - p2) {
data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
j++;
}
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] = MIN(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
j++;
}
__syncthreads();
}
}
static __device__ inline void maxOfData(float* data, int n) // caution: not verified to work
{
__syncthreads();
int p2 = 1;
while(p2 * 2 < n) p2 *= 2;
int j = 0;
while((threadIdx.x + blockDim.x * j) < n - p2) {
data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2]);
j++;
}
__syncthreads();
for(int i = 2; i <= p2; i *= 2) {
while((threadIdx.x + blockDim.x * j) < p2 / i) {
data[threadIdx.x + blockDim.x * j] = MAX(data[threadIdx.x + blockDim.x * j], data[(threadIdx.x + blockDim.x * j) + p2 / i]);
j++;
}
__syncthreads();
}
}
#if X_PRECISION == 2
static __device__ inline double tex1Dfetch_double(texture<int2, 1> t, int i)
{
int2 v = tex1Dfetch(t, i);
return __hiloint2double(v.y, v.x);
}
static __device__ inline X_FLOAT4 tex1Dfetch_double(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t, 2 * i);
int4 u = tex1Dfetch(t, 2 * i + 1);
X_FLOAT4 w;
w.x = __hiloint2double(v.y, v.x);
w.y = __hiloint2double(v.w, v.z);
w.z = __hiloint2double(u.y, u.x);
w.w = __hiloint2double(u.w, u.z);
return w;
}
#endif
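// Texture fetches are limited to 32-bit channels on this hardware, so a
// double is read as an int2 and reassembled with __hiloint2double; an
// X_FLOAT4 correspondingly takes two int4 fetches. This is the standard
// CUDA idiom for double-precision texture reads, not specific to LAMMPS.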
inline void BindXTypeTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
_x_type_tex.normalized = false; // do not use normalized texture coordinates
_x_type_tex.filterMode = cudaFilterModePoint; // point sampling, no interpolation
_x_type_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* x_type_texture_ptr = &MY_AP(x_type_tex);
#if X_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * sizeof(X_FLOAT4));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
cudaBindTexture(0, x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
static __device__ inline X_FLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
return tex1Dfetch(_x_type_tex, i);
#else
return tex1Dfetch_double(_x_type_tex, i);
#endif
#else
return _x_type[i];
#endif
}
#if V_PRECISION == 2
static __device__ inline double tex1Dfetch_double_v(texture<int2, 1> t, int i)
{
int2 v = tex1Dfetch(t, i);
return __hiloint2double(v.y, v.x);
}
static __device__ inline V_FLOAT4 tex1Dfetch_double_v(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t, 2 * i);
int4 u = tex1Dfetch(t, 2 * i + 1);
V_FLOAT4 w;
w.x = __hiloint2double(v.y, v.x);
w.y = __hiloint2double(v.w, v.z);
w.z = __hiloint2double(u.y, u.x);
w.w = __hiloint2double(u.w, u.z);
return w;
}
#endif
inline void BindVRadiusTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
_v_radius_tex.normalized = false; // do not use normalized texture coordinates
_v_radius_tex.filterMode = cudaFilterModePoint; // point sampling, no interpolation
_v_radius_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* v_radius_texture_ptr = &MY_AP(v_radius_tex);
#if V_PRECISION == 1
cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<float4>();
cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
cudaChannelFormatDesc channelDescVRadius = cudaCreateChannelDesc<int4>();
cudaBindTexture(0, v_radius_texture_ptr, sdata->atom.v_radius.dev_data, &channelDescVRadius, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
static __device__ inline V_FLOAT4 fetchVRadius(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
return tex1Dfetch(_v_radius_tex, i);
#else
return tex1Dfetch_double_v(_v_radius_tex, i);
#endif
#else
return _v_radius[i];
#endif
}
inline void BindOmegaRmassTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
_omega_rmass_tex.normalized = false; // do not use normalized texture coordinates
_omega_rmass_tex.filterMode = cudaFilterModePoint; // point sampling, no interpolation
_omega_rmass_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* omega_rmass_texture_ptr = &MY_AP(omega_rmass_tex);
#if V_PRECISION == 1
cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<float4>();
cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * sizeof(V_FLOAT4));
#else
cudaChannelFormatDesc channelDescOmegaRmass = cudaCreateChannelDesc<int4>();
cudaBindTexture(0, omega_rmass_texture_ptr, sdata->atom.omega_rmass.dev_data, &channelDescOmegaRmass, sdata->atom.nmax * 2 * sizeof(int4));
#endif
#endif
}
static __device__ inline V_FLOAT4 fetchOmegaRmass(int i)
{
#ifdef CUDA_USE_TEXTURE
#if V_PRECISION == 1
return tex1Dfetch(_omega_rmass_tex, i);
#else
return tex1Dfetch_double_v(_omega_rmass_tex, i);
#endif
#else
return _omega_rmass[i];
#endif
}
#if F_PRECISION == 2
static __device__ inline double tex1Dfetch_double_f(texture<int2, 1> t, int i)
{
int2 v = tex1Dfetch(t, i);
return __hiloint2double(v.y, v.x);
}
static __device__ inline F_FLOAT4 tex1Dfetch_double_f(texture<int4, 1> t, int i)
{
int4 v = tex1Dfetch(t, 2 * i);
int4 u = tex1Dfetch(t, 2 * i + 1);
F_FLOAT4 w;
w.x = __hiloint2double(v.y, v.x);
w.y = __hiloint2double(v.w, v.z);
w.z = __hiloint2double(u.y, u.x);
w.w = __hiloint2double(u.w, u.z);
return w;
}
#endif
inline void BindQTexture(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_TEXTURE
_q_tex.normalized = false; // do not use normalized texture coordinates
_q_tex.filterMode = cudaFilterModePoint; // point sampling, no interpolation
_q_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* q_texture_ptr = &MY_AP(q_tex);
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<float>();
cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(F_FLOAT));
#else
cudaChannelFormatDesc channelDescQ = cudaCreateChannelDesc<int2>();
cudaBindTexture(0, q_texture_ptr, sdata->atom.q.dev_data, &channelDescQ, sdata->atom.nmax * sizeof(int2));
#endif
#endif
}
static __device__ inline F_FLOAT fetchQ(int i)
{
#ifdef CUDA_USE_TEXTURE
#if F_PRECISION == 1
return tex1Dfetch(_q_tex, i);
#else
return tex1Dfetch_double_f(_q_tex, i);
#endif
#else
return _q[i];
#endif
}
#endif
/*
inline void BindPairCoeffTypeTexture(cuda_shared_data* sdata,coeff_tex)
{
#ifdef CUDA_USE_TEXTURE
_coeff_tex.normalized = false; // do not use normalized texture coordinates
_coeff_tex.filterMode = cudaFilterModePoint; // point sampling, no interpolation
_coeff_tex.addressMode[0] = cudaAddressModeWrap; // wrap texture coordinates
const textureReference* coeff_texture_ptr;
cudaGetTextureReference(&coeff_texture_ptr, &MY_AP(coeff_tex));
#if F_PRECISION == 1
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<float4>();
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*sizeof(X_FLOAT4));
#else
cudaChannelFormatDesc channelDescXType = cudaCreateChannelDesc<int4>();
cudaBindTexture(0,x_type_texture_ptr, sdata->atom.x_type.dev_data, &channelDescXType, sdata->atom.nmax*2*sizeof(int4));
#endif
#endif
}
static __device__ inline X_FLOAT4 fetchXType(int i)
{
#ifdef CUDA_USE_TEXTURE
#if X_PRECISION == 1
return tex1Dfetch(_x_type_tex,i);
#else
return tex1Dfetch_double(_x_type_tex,i);
#endif
#else
return _x_type[i];
#endif
}
*/
#define SBBITS 30
static inline __device__ int sbmask(int j)
{
return j >> SBBITS & 3;
}
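// Example: with SBBITS == 30, sbmask(j) extracts the two high bits that
// LAMMPS packs into a neighbor index to mark special (1-2, 1-3, 1-4)
// bonded neighbors; a return value of 0 means the pair is not special.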
static inline __device__ void minimum_image(X_FLOAT4 &delta)
{
if(_triclinic == 0) {
if(_periodicity[0]) {
delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
(delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
}
if(_periodicity[1]) {
delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
(delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
}
if(_periodicity[2]) {
delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
(delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
}
} else {
// apply the tilt-coupled shifts before wrapping the component they depend on
if(_periodicity[2]) {
delta.y += delta.z < -X_F(0.5) * _prd[2] ? _h[3] :
(delta.z > X_F(0.5) * _prd[2] ? -_h[3] : X_F(0.0));
delta.x += delta.z < -X_F(0.5) * _prd[2] ? _h[4] :
(delta.z > X_F(0.5) * _prd[2] ? -_h[4] : X_F(0.0));
delta.z += delta.z < -X_F(0.5) * _prd[2] ? _prd[2] :
(delta.z > X_F(0.5) * _prd[2] ? -_prd[2] : X_F(0.0));
}
if(_periodicity[1]) {
delta.x += delta.y < -X_F(0.5) * _prd[1] ? _h[5] :
(delta.y > X_F(0.5) * _prd[1] ? -_h[5] : X_F(0.0));
delta.y += delta.y < -X_F(0.5) * _prd[1] ? _prd[1] :
(delta.y > X_F(0.5) * _prd[1] ? -_prd[1] : X_F(0.0));
}
if(_periodicity[0]) {
delta.x += delta.x < -X_F(0.5) * _prd[0] ? _prd[0] :
(delta.x > X_F(0.5) * _prd[0] ? -_prd[0] : X_F(0.0));
}
}
}
static inline __device__ void closest_image(X_FLOAT4 &x1, X_FLOAT4 &x2, X_FLOAT4 &ci)
{
ci.x = x2.x - x1.x;
ci.y = x2.y - x1.y;
ci.z = x2.z - x1.z;
minimum_image(ci);
ci.x += x1.x;
ci.y += x1.y;
ci.z += x1.z;
}

View File

@ -1,22 +0,0 @@
#include "cuda_precision.h"
#include "cuda_shared.h"
#include "cuda_cu.h"
void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata)
{
sdata->compile_settings.prec_glob=sizeof(CUDA_FLOAT)/4;
sdata->compile_settings.prec_x=sizeof(X_FLOAT)/4;
sdata->compile_settings.prec_v=sizeof(V_FLOAT)/4;
sdata->compile_settings.prec_f=sizeof(F_FLOAT)/4;
sdata->compile_settings.prec_pppm=sizeof(PPPM_FLOAT)/4;
sdata->compile_settings.prec_fft=sizeof(FFT_FLOAT)/4;
#ifdef FFT_CUFFT
sdata->compile_settings.cufft=1;
#else
sdata->compile_settings.cufft=0;
#endif
sdata->compile_settings.arch=CUDA_ARCH;
}

View File

@ -1,344 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_COMMON_H_
#define _CUDA_COMMON_H_
//#include "cutil.h"
#include "cuda_precision.h"
#include "cuda_wrapper_cu.h"
#define CUDA_MAX_TYPES_PLUS_ONE 12 //for pair styles which use constant space for parameters, this needs to be one larger than the number of atom types
//this cannot be arbitrarily large, since constant space is limited.
//in principle one could alter potentials to use global memory for parameters; some do that already, since the first examples I encountered had a high number (20+) of atom types
//Christian
#define CUDA_MAX_TYPES2 (CUDA_MAX_TYPES_PLUS_ONE * CUDA_MAX_TYPES_PLUS_ONE)
#define CUDA_MAX_NSPECIAL 25
// define some easy-to-use debug and emulation macros
#ifdef _DEBUG
#define MYDBG(a) a
#else
#define MYDBG(a)
#endif
#if __DEVICE_EMULATION__
#define MYEMU(a) a
#else
#define MYEMU(a)
#endif
#define MYEMUDBG(a) MYEMU(MYDBG(a))
// Add Prefix (needed as a workaround: identical constant names in different files would otherwise conflict)
#define MY_ADD_PREFIX(prefix, var) prefix##_##var
#define MY_ADD_PREFIX2(prefix, var) MY_ADD_PREFIX(prefix, var)
#define MY_AP(var) MY_ADD_PREFIX2(MY_PREFIX, var)
#define MY_VAR_TO_STR(var) #var
#define MY_VAR_TO_STR2(var) MY_VAR_TO_STR(var)
#define MY_CONST(var) (MY_VAR_TO_STR2(MY_PREFIX) "_" MY_VAR_TO_STR2(var))
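// Example expansion with MY_PREFIX defined as compute_temp_partial_cuda:
//   MY_AP(buffer)    -> compute_temp_partial_cuda_buffer  (the symbol itself)
//   MY_CONST(buffer) -> "compute_temp_partial_cuda_buffer" (the string name
//   passed to cudaMemcpyToSymbol by the host wrappers)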
#define CUDA_USE_TEXTURE
#define CUDA_USE_FLOAT4
//constants used by many classes
//domain
#define _boxhi MY_AP(boxhi)
#define _boxlo MY_AP(boxlo)
#define _subhi MY_AP(subhi)
#define _sublo MY_AP(sublo)
#define _box_size MY_AP(box_size)
#define _prd MY_AP(prd)
#define _periodicity MY_AP(periodicity)
#define _triclinic MY_AP(triclinic)
#define _boxhi_lamda MY_AP(boxhi_lamda)
#define _boxlo_lamda MY_AP(boxlo_lamda)
#define _prd_lamda MY_AP(prd_lamda)
#define _h MY_AP(h)
#define _h_inv MY_AP(h_inv)
#define _h_rate MY_AP(h_rate)
__device__ __constant__ X_FLOAT _boxhi[3];
__device__ __constant__ X_FLOAT _boxlo[3];
__device__ __constant__ X_FLOAT _subhi[3];
__device__ __constant__ X_FLOAT _sublo[3];
__device__ __constant__ X_FLOAT _box_size[3];
__device__ __constant__ X_FLOAT _prd[3];
__device__ __constant__ int _periodicity[3];
__device__ __constant__ int _triclinic;
__device__ __constant__ X_FLOAT _boxhi_lamda[3];
__device__ __constant__ X_FLOAT _boxlo_lamda[3];
__device__ __constant__ X_FLOAT _prd_lamda[3];
__device__ __constant__ X_FLOAT _h[6];
__device__ __constant__ X_FLOAT _h_inv[6];
__device__ __constant__ V_FLOAT _h_rate[6];
//atom properties
#define _x MY_AP(x)
#define _v MY_AP(v)
#define _f MY_AP(f)
#define _tag MY_AP(tag)
#define _type MY_AP(type)
#define _mask MY_AP(mask)
#define _image MY_AP(image)
#define _q MY_AP(q)
#define _mass MY_AP(mass)
#define _rmass MY_AP(rmass)
#define _rmass_flag MY_AP(rmass_flag)
#define _eatom MY_AP(eatom)
#define _vatom MY_AP(vatom)
#define _x_type MY_AP(x_type)
#define _radius MY_AP(radius)
#define _density MY_AP(density)
#define _omega MY_AP(omega)
#define _torque MY_AP(torque)
#define _special MY_AP(special)
#define _maxspecial MY_AP(maxspecial)
#define _nspecial MY_AP(nspecial)
#define _special_flag MY_AP(special_flag)
#define _molecule MY_AP(molecule)
#define _v_radius MY_AP(v_radius)
#define _omega_rmass MY_AP(omega_rmass)
#define _freeze_group_bit MY_AP(freeze_group_bit)
#define _map_array MY_AP(map_array)
__device__ __constant__ X_FLOAT* _x; //holds pointer to positions
__device__ __constant__ V_FLOAT* _v;
__device__ __constant__ F_FLOAT* _f;
__device__ __constant__ int* _tag;
__device__ __constant__ int* _type;
__device__ __constant__ int* _mask;
__device__ __constant__ int* _image;
__device__ __constant__ V_FLOAT* _mass;
__device__ __constant__ F_FLOAT* _q;
__device__ __constant__ V_FLOAT* _rmass;
__device__ __constant__ int _rmass_flag;
__device__ __constant__ ENERGY_FLOAT* _eatom;
__device__ __constant__ ENERGY_FLOAT* _vatom;
__device__ __constant__ X_FLOAT4* _x_type; //holds pointer to packed positions and type
__device__ __constant__ X_FLOAT* _radius;
__device__ __constant__ F_FLOAT* _density;
__device__ __constant__ V_FLOAT* _omega;
__device__ __constant__ F_FLOAT* _torque;
__device__ __constant__ int* _special;
__device__ __constant__ int _maxspecial;
__device__ __constant__ int* _nspecial;
__device__ __constant__ int _special_flag[4];
__device__ __constant__ int* _molecule;
__device__ __constant__ V_FLOAT4* _v_radius; //holds pointer to packed velocities and radius
__device__ __constant__ V_FLOAT4* _omega_rmass; //holds pointer to packed omega and rmass
__device__ __constant__ int _freeze_group_bit;
__device__ __constant__ int* _map_array;
#ifdef CUDA_USE_TEXTURE
#define _x_tex MY_AP(x_tex)
#if X_PRECISION == 1
texture<float> _x_tex;
#else
texture<int2,1> _x_tex;
#endif
#define _type_tex MY_AP(type_tex)
texture<int> _type_tex;
#define _x_type_tex MY_AP(x_type_tex)
#if X_PRECISION == 1
texture<float4,1> _x_type_tex;
#else
texture<int4,1> _x_type_tex;
#endif
#define _v_radius_tex MY_AP(v_radius_tex)
#if V_PRECISION == 1
texture<float4,1> _v_radius_tex;
#else
texture<int4,1> _v_radius_tex;
#endif
#define _omega_rmass_tex MY_AP(omega_rmass_tex)
#if V_PRECISION == 1
texture<float4,1> _omega_rmass_tex;
#else
texture<int4,1> _omega_rmass_tex;
#endif
#define _q_tex MY_AP(q_tex)
#if F_PRECISION == 1
texture<float> _q_tex;
#else
texture<int2,1> _q_tex;
#endif
#endif
//neighbor
#ifdef IncludeCommonNeigh
#define _inum MY_AP(inum)
#define _inum_border MY_AP(inum_border)
#define _ilist MY_AP(ilist)
#define _ilist_border MY_AP(ilist_border)
#define _numneigh MY_AP(numneigh)
#define _numneigh_border MY_AP(numneigh_border)
#define _numneigh_inner MY_AP(numneigh_inner)
#define _firstneigh MY_AP(firstneigh)
#define _neighbors MY_AP(neighbors)
#define _neighbors_border MY_AP(neighbors_border)
#define _neighbors_inner MY_AP(neighbors_inner)
#define _reneigh_flag MY_AP(reneigh_flag)
#define _triggerneighsq MY_AP(triggerneighsq)
#define _xhold MY_AP(xhold)
#define _maxhold MY_AP(maxhold)
#define _dist_check MY_AP(dist_check)
#define _neighbor_maxlocal MY_AP(neighbor_maxlocal)
#define _maxneighbors MY_AP(maxneighbors)
#define _overlap_comm MY_AP(overlap_comm)
__device__ __constant__ int _inum;
__device__ __constant__ int* _inum_border;
__device__ __constant__ int* _ilist;
__device__ __constant__ int* _ilist_border;
__device__ __constant__ int* _numneigh;
__device__ __constant__ int* _numneigh_border;
__device__ __constant__ int* _numneigh_inner;
__device__ __constant__ int** _firstneigh;
__device__ __constant__ int* _neighbors;
__device__ __constant__ int* _neighbors_border;
__device__ __constant__ int* _neighbors_inner;
__device__ __constant__ int* _reneigh_flag;
__device__ __constant__ X_FLOAT _triggerneighsq;
__device__ __constant__ X_FLOAT* _xhold; //holds pointer to positions at the last neighbor list build
__device__ __constant__ int _maxhold;
__device__ __constant__ int _dist_check;
__device__ __constant__ int _neighbor_maxlocal;
__device__ __constant__ int _maxneighbors;
__device__ __constant__ int _overlap_comm;
#endif
//system properties
#define _nall MY_AP(nall)
#define _nghost MY_AP(nghost)
#define _nlocal MY_AP(nlocal)
#define _nmax MY_AP(nmax)
#define _cuda_ntypes MY_AP(cuda_ntypes)
#define _dtf MY_AP(dtf)
#define _dtv MY_AP(dtv)
#define _factor MY_AP(factor)
#define _virial MY_AP(virial)
#define _eng_vdwl MY_AP(eng_vdwl)
#define _eng_coul MY_AP(eng_coul)
#define _molecular MY_AP(molecular)
__device__ __constant__ unsigned _nall;
__device__ __constant__ unsigned _nghost;
__device__ __constant__ unsigned _nlocal;
__device__ __constant__ unsigned _nmax;
__device__ __constant__ unsigned _cuda_ntypes;
__device__ __constant__ V_FLOAT _dtf;
__device__ __constant__ X_FLOAT _dtv;
__device__ __constant__ V_FLOAT _factor;
__device__ __constant__ ENERGY_FLOAT* _virial;
__device__ __constant__ ENERGY_FLOAT* _eng_vdwl;
__device__ __constant__ ENERGY_FLOAT* _eng_coul;
__device__ __constant__ int _molecular;
//other general constants
#define _buffer MY_AP(buffer)
#define _flag MY_AP(flag)
#define _debugdata MY_AP(debugdata)
__device__ __constant__ void* _buffer;
__device__ __constant__ int* _flag;
__device__ __constant__ int* _debugdata;
// pointers to data fields on the GPU are held in constant space
// -> reduces register usage and the number of parameters for kernel calls
// they will be variables of file scope in cuda files
// macro that may be used to print a cudaError_t
#define MY_OUTPUT_RESULT(result) \
switch(result) \
{ \
case cudaSuccess: printf(" => cudaSuccess\n"); break; \
case cudaErrorInvalidValue: printf(" => cudaErrorInvalidValue\n"); break; \
case cudaErrorInvalidSymbol: printf(" => cudaErrorInvalidSymbol\n"); break; \
case cudaErrorInvalidDevicePointer: printf(" => cudaErrorInvalidDevicePointer\n"); break; \
case cudaErrorInvalidMemcpyDirection: printf(" => cudaErrorInvalidMemcpyDirection\n"); break; \
default: printf(" => unknown\n"); break; \
}
#ifdef _DEBUG
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#else
# define CUT_CHECK_ERROR(errorMessage) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
exit(EXIT_FAILURE); \
} \
}
#endif
# define CUDA_SAFE_CALL_NO_SYNC( call) { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} }
# define CUDA_SAFE_CALL( call) CUDA_SAFE_CALL_NO_SYNC(call);
#define X_MASK 1
#define V_MASK 2
#define F_MASK 4
#define TAG_MASK 8
#define TYPE_MASK 16
#define MASK_MASK 32
#define IMAGE_MASK 64
#define Q_MASK 128
#define MOLECULE_MASK 256
#define RMASS_MASK 512
#define RADIUS_MASK 1024
#define DENSITY_MASK 2048
#define OMEGA_MASK 4096
#define TORQUE_MASK 8192
#endif // _CUDA_COMMON_H_

View File

@ -1 +0,0 @@
extern "C" void Cuda_Cuda_GetCompileSettings(cuda_shared_data* sdata);

View File

@ -1,168 +0,0 @@
enum copy_mode {x, xx, xy, yx, xyz, xzy}; // yxz, yzx, zxy, zyx are not implemented since they have not been needed so far
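// The mode names give host layout followed by device layout: xx/xy/xyz are
// plain element-wise copies, yx uploads the transpose of an nx-by-ny array
// (host index j*ny+k goes to device index k*nx+j), and xzy swaps the two
// innermost dimensions of a 3d array; x is shorthand for the 1d copy xx.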
#include "cuda_data_cu.h"
#include "cuda_wrapper_cu.h"
#include "cuda_data_kernel.cu"
#include <cstdio>
void CudaData_Upload_DoubleFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer)
{
int size=n[0];
if(n[1]>0) size*=n[1];
if(n[2]>0) size*=n[2];
dim3 threads; threads.x=1; threads.y=1; threads.z=1;
dim3 grid; grid.x=1; grid.y=1; grid.z=1;
if(size<=128*30)
threads.x=32;
else if(size<=256*30)
threads.x=64;
else if(size<=512*30)
threads.x=128;
else
threads.x=256;
grid.x=((size-1)+threads.x)/threads.x;
if(grid.x>32000)
grid.x=32000;
while(grid.x*grid.y*threads.x<size) grid.y++;
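// the remainder of this variant carries debug instrumentation (a size
// printout and a round-trip checksum of the uploaded data) that the other
// upload variants below do not have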
float debugdata[size];
//int* cu_debug=(int*) CudaWrapper_AllocCudaData(size*sizeof(FLOAT));
size*=sizeof(double);
printf("size: %i (%i %i %i) (%i %i %i) %p\n",size,grid.x,grid.y,threads.x,n[0],n[1],n[2],buffer);
CudaWrapper_UploadCudaData(host_data, buffer, size);
CudaData_Upload_Kernel_DoubleFloat<<<grid,threads>>>((double*)buffer,(float*)dev_data,n[0],n[1],n[2],mode);
cudaThreadSynchronize();
CudaWrapper_DownloadCudaData(debugdata, dev_data, size/2);
double sum=0;
printf("debugdata: ");
for(int i=0;i<size/sizeof(double);i++) sum+=(debugdata[i]-((double*) host_data)[i])*(debugdata[i]-((double*) host_data)[i]);
printf("%lf \n",sum);
}
void CudaData_Upload_DoubleDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer)
{
int size=n[0];
if(n[1]>0) size*=n[1];
if(n[2]>0) size*=n[2];
dim3 threads; threads.x=1; threads.y=1; threads.z=1;
dim3 grid; grid.x=1; grid.y=1; grid.z=1;
if(size<=128*30)
threads.x=32;
else if(size<=256*30)
threads.x=64;
else if(size<=512*30)
threads.x=128;
else
threads.x=256;
grid.x=((size-1)+threads.x)/threads.x;
if(grid.x>32000)
grid.x=32000;
while(grid.x*grid.y*threads.x<size) grid.y++;
size*=sizeof(double);
CudaWrapper_UploadCudaData(host_data, buffer, size);
CudaData_Upload_Kernel_DoubleDouble<<<grid,threads>>>((double*)buffer,(double*)dev_data,n[0],n[1],n[2],mode);
cudaThreadSynchronize();
}
void CudaData_Upload_FloatDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer)
{
int size=n[0];
if(n[1]>0) size*=n[1];
if(n[2]>0) size*=n[2];
dim3 threads; threads.x=1; threads.y=1; threads.z=1;
dim3 grid; grid.x=1; grid.y=1; grid.z=1;
if(size<=128*30)
threads.x=32;
else if(size<=256*30)
threads.x=64;
else if(size<=512*30)
threads.x=128;
else
threads.x=256;
grid.x=((size-1)+threads.x)/threads.x;
if(grid.x>32000)
grid.x=32000;
while(grid.x*grid.y*threads.x<size) grid.y++;
size*=sizeof(float);
CudaWrapper_UploadCudaData(host_data, buffer, size);
CudaData_Upload_Kernel_FloatDouble<<<grid,threads>>>((float*)buffer,(double*)dev_data,n[0],n[1],n[2],mode);
cudaThreadSynchronize();
}
void CudaData_Upload_FloatFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer)
{
int size=n[0];
if(n[1]>0) size*=n[1];
if(n[2]>0) size*=n[2];
dim3 threads; threads.x=1; threads.y=1; threads.z=1;
dim3 grid; grid.x=1; grid.y=1; grid.z=1;
if(size<=128*30)
threads.x=32;
else if(size<=256*30)
threads.x=64;
else if(size<=512*30)
threads.x=128;
else
threads.x=256;
grid.x=((size-1)+threads.x)/threads.x;
if(grid.x>32000)
grid.x=32000;
while(grid.x*grid.y*threads.x<size) grid.y++;
size*=sizeof(float);
CudaWrapper_UploadCudaData(host_data, buffer, size);
CudaData_Upload_Kernel_FloatFloat<<<grid,threads>>>((float*)buffer,(float*)dev_data,n[0],n[1],n[2],mode);
cudaThreadSynchronize();
}
void CudaData_Upload_IntInt(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer)
{
int size=n[0];
if(n[1]>0) size*=n[1];
if(n[2]>0) size*=n[2];
dim3 threads; threads.x=1; threads.y=1; threads.z=1;
dim3 grid; grid.x=1; grid.y=1; grid.z=1;
if(size<=128*30)
threads.x=32;
else if(size<=256*30)
threads.x=64;
else if(size<=512*30)
threads.x=128;
else
threads.x=256;
grid.x=((size-1)+threads.x)/threads.x;
if(grid.x>32000)
grid.x=32000;
while(grid.x*grid.y*threads.x<size) grid.y++;
size*=sizeof(int);
CudaWrapper_UploadCudaData(host_data, buffer, size);
CudaData_Upload_Kernel_IntInt<<<grid,threads>>>((int*)buffer,(int*)dev_data,n[0],n[1],n[2],mode);
cudaThreadSynchronize();
}
void CudaData_Download(void* host_data,void* dev_data,int host_size, int dev_size, unsigned* n,copy_mode mode,void* buffer)
{
// stub: downloading with layout or precision conversion has not been implemented
}

View File

@ -1,13 +0,0 @@
#ifndef CUDA_DATA_CU_H_
#define CUDA_DATA_CU_H_
extern "C" void CudaData_Upload_DoubleFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer);
extern "C" void CudaData_Upload_DoubleDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer);
extern "C" void CudaData_Upload_FloatDouble(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer);
extern "C" void CudaData_Upload_FloatFloat(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer);
extern "C" void CudaData_Upload_IntInt(void* host_data,void* dev_data, unsigned* n,copy_mode mode,void* buffer);
extern "C" void CudaData_Download(void* host_data,void* dev_data,int host_size, int dev_size, unsigned* n,copy_mode mode,void* buffer);
#endif /*CUDA_DATA_CU_H_*/

View File

@ -1,156 +0,0 @@
__global__ void CudaData_Upload_Kernel_DoubleFloat(double* buffer,float* dev_data,
unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
if(mode==x) mode=xx;
unsigned length=nx;
if(ny>0) length*=ny;
if(nz>0) length*=nz;
unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
if(i>=length) return;
switch(mode)
{
case xx:
case xy:
case xyz:
dev_data[i]=buffer[i];
break;
case yx:
j=i/ny;
k=i%ny;
dev_data[k*nx+j]=buffer[j*ny+k];
break;
case xzy:
j=i/(ny*nz);
k=(i%(ny*nz))/nz;
l=i%nz;
dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
break;
}
}
__global__ void CudaData_Upload_Kernel_DoubleDouble(double* buffer,double* dev_data,
unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
if(mode==x) mode=xx;
unsigned length=nx;
if(ny>0) length*=ny;
if(nz>0) length*=nz;
unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
if(i>=length) return;
switch(mode)
{
case xx:
case xy:
case xyz:
dev_data[i]=buffer[i];
break;
case yx:
j=i/ny;
k=i%ny;
dev_data[k*nx+j]=buffer[j*ny+k];
break;
case xzy:
j=i/(ny*nz);
k=(i%(ny*nz))/nz;
l=i%nz;
dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
break;
}
}
__global__ void CudaData_Upload_Kernel_FloatDouble(float* buffer,double* dev_data,
unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
if(mode==x) mode=xx;
unsigned length=nx;
if(ny>0) length*=ny;
if(nz>0) length*=nz;
unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
if(i>=length) return;
switch(mode)
{
case xx:
case xy:
case xyz:
dev_data[i]=buffer[i];
break;
case yx:
j=i/ny;
k=i%ny;
dev_data[k*nx+j]=buffer[j*ny+k];
break;
case xzy:
j=i/(ny*nz);
k=(i%(ny*nz))/nz;
l=i%nz;
dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
break;
}
}
__global__ void CudaData_Upload_Kernel_FloatFloat(float* buffer,float* dev_data,
unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
if(mode==x) mode=xx;
unsigned length=nx;
if(ny>0) length*=ny;
if(nz>0) length*=nz;
unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
if(i>=length) return;
switch(mode)
{
case xx:
case xy:
case xyz:
dev_data[i]=buffer[i];
break;
case yx:
j=i/ny;
k=i%ny;
dev_data[k*nx+j]=buffer[j*ny+k];
break;
case xzy:
j=i/(ny*nz);
k=(i%(ny*nz))/nz;
l=i%nz;
dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
break;
}
}
__global__ void CudaData_Upload_Kernel_IntInt(int* buffer,int* dev_data,
unsigned nx,unsigned ny,unsigned nz,copy_mode mode)
{
if(mode==x) mode=xx;
unsigned length=nx;
if(ny>0) length*=ny;
if(nz>0) length*=nz;
unsigned i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x,j,k,l;
if(i>=length) return;
switch(mode)
{
case xx:
case xy:
case xyz:
dev_data[i]=buffer[i];
break;
case yx:
j=i/ny;
k=i%ny;
dev_data[k*nx+j]=buffer[j*ny+k];
break;
case xzy:
j=i/(ny*nz);
k=(i%(ny*nz))/nz;
l=i%nz;
dev_data[j*ny*nz+l*ny+k]=buffer[j*ny*nz+k*nz+l];
break;
}
}

File diff suppressed because it is too large

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_Pair_GenerateXType(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_RevertXType(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_GenerateVRadius(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_GenerateOmegaRmass(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_BuildXHold(cuda_shared_data* sdata);
extern "C" void Cuda_Pair_CollectForces(cuda_shared_data* sdata,int eflag, int vflag);

File diff suppressed because it is too large

View File

@ -1,126 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ ENERGY_FLOAT sharedmem[];
static inline __device__ void PairVirialCompute_A_Kernel(int eflag,int vflag,int coulflag=0)
{
__syncthreads();
ENERGY_FLOAT* shared=sharedmem;
if(eflag)
{
reduceBlock(shared);
shared+=blockDim.x;
if(coulflag)
{
reduceBlock(shared);
shared+=blockDim.x;
}
}
if(vflag)
{
reduceBlock(shared + 0 * blockDim.x);
reduceBlock(shared + 1 * blockDim.x);
reduceBlock(shared + 2 * blockDim.x);
reduceBlock(shared + 3 * blockDim.x);
reduceBlock(shared + 4 * blockDim.x);
reduceBlock(shared + 5 * blockDim.x);
}
if(threadIdx.x == 0)
{
shared=sharedmem;
ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
if(eflag)
{
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0];
shared+=blockDim.x; buffer+=gridDim.x * gridDim.y;
if(coulflag)
{
buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5)*shared[0];
shared+=blockDim.x; buffer+=gridDim.x * gridDim.y;
}
}
if(vflag)
{
buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[0 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[1 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[2 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[3 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[4 * blockDim.x];
buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5)*shared[5 * blockDim.x];
}
}
__syncthreads();
}
__global__ void MY_AP(PairVirialCompute_reduce)(int n)
{
sharedmem[threadIdx.x] = ENERGY_F(0.0);
ENERGY_FLOAT sum = ENERGY_F(0.0);
ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
buf = &buf[blockIdx.x * n];
//if(blockIdx.x==2) buf=&buf[n];
for(int i = 0; i < n; i += blockDim.x)
{
sharedmem[threadIdx.x] = (i + threadIdx.x < n) ? buf[i + threadIdx.x] : ENERGY_F(0.0);
__syncthreads();
reduceBlock(sharedmem);
if(threadIdx.x == 0) sum += sharedmem[0];
}
if(threadIdx.x==0)
{
if(gridDim.x == 1) //evdwl
{
_eng_vdwl[0]+=sum;
}
if(gridDim.x == 2) //evdwl + ecoul only
{
if(blockIdx.x==0)
_eng_vdwl[0]+=sum;
else
_eng_coul[0]+=sum;
}
if(gridDim.x == 6) //virial
{
_virial[blockIdx.x] += sum;
}
if(gridDim.x == 7) //evdwl+virial
{
if(blockIdx.x==0)
_eng_vdwl[0]+=sum;
else _virial[blockIdx.x-1] += sum;
}
if(gridDim.x == 8) //evdwl+ecoul+virial
{
if(blockIdx.x==0)
_eng_vdwl[0]+=sum;
else
if(blockIdx.x==1)
_eng_coul[0]+=sum;
else
_virial[blockIdx.x-2] += sum;
}
}
}

View File

@ -1,284 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef CUDA_PRECISION_H_
#define CUDA_PRECISION_H_
/* This file gives the type definitions for mixed precision calculation in the cuda part of LAMMPS-CUDA.
 * The default behaviour is set by the global CUDA_PRECISION (it can be overridden during compilation).
 * ***_FLOAT: type definition of the given property
 * ***_F: literal-suffix macro (1.0 is interpreted as double while 1.0f is interpreted as float; write CUDA_F(1.0) to get the literal matching CUDA_FLOAT)
*/
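/* Illustrative only (these variables are hypothetical, not part of this header):
 *   X_FLOAT dx = x1 - x2;                        // positions use X_PRECISION
 *   F_FLOAT fpair = F_F(2.0) * dx;               // forces use F_PRECISION, literal via F_F()
 *   ENERGY_FLOAT e = ENERGY_F(0.5) * fpair * dx; // energies use ENERGY_PRECISION
 */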
#ifdef CUDA_USE_BINNING
#define CUDA_IF_BINNING(a) a
#else
#define CUDA_IF_BINNING(a)
#endif
//GLOBAL
#ifdef CUDA_PRECISION
#if CUDA_PRECISION == 1
#define CUDA_FLOAT float
#define CUDA_F(x) x##f
#endif
#if CUDA_PRECISION == 2
#define CUDA_FLOAT double
#define CUDA_F(x) x
#endif
#endif
#ifndef CUDA_PRECISION
#define CUDA_FLOAT double
#define CUDA_F(x) x
#define CUDA_PRECISION 2
#endif
//--------------------------------
//-----------FFT-----------------
//--------------------------------
#ifdef FFT_PRECISION_CU
#if FFT_PRECISION_CU == 1
#define FFT_FLOAT float
#define FFT_F(x) x##f
#endif
#if FFT_PRECISION_CU == 2
#define FFT_FLOAT double
#define FFT_F(x) x
#endif
#endif
#ifndef FFT_PRECISION_CU
#define FFT_FLOAT CUDA_FLOAT
#define FFT_F(x) CUDA_F(x)
#define FFT_PRECISION_CU CUDA_PRECISION
#endif
//--------------------------------
//-----------PPPM-----------------
//--------------------------------
#ifndef PPPM_PRECISION
#define PPPM_PRECISION CUDA_PRECISION
#endif
#ifdef PPPM_PRECISION
#if PPPM_PRECISION == 1
#define PPPM_FLOAT float
#ifdef float3
#define PPPM_FLOAT3 float3
#else
struct PPPM_FLOAT3
{
PPPM_FLOAT x;
PPPM_FLOAT y;
PPPM_FLOAT z;
};
#endif
#define PPPM_F(x) x##f
#endif
#if PPPM_PRECISION == 2
#define PPPM_FLOAT double
struct PPPM_FLOAT3
{
PPPM_FLOAT x;
PPPM_FLOAT y;
PPPM_FLOAT z;
};
#define PPPM_F(x) x
#endif
#endif
//--------------------------------
//-----------FORCE-----------------
//--------------------------------
#ifdef F_PRECISION
#if F_PRECISION == 1
#define F_FLOAT float
#define F_F(x) x##f
#endif
#if F_PRECISION == 2
#define F_FLOAT double
#define F_F(x) x
#endif
#endif
#ifndef F_PRECISION
#define F_FLOAT CUDA_FLOAT
#define F_F(x) CUDA_F(x)
#define F_PRECISION CUDA_PRECISION
#endif
#if F_PRECISION == 1
#define _SQRT_ sqrtf
#define _RSQRT_ rsqrtf
#define _EXP_ expf
#else
#define _SQRT_ sqrt
#define _RSQRT_ rsqrt
#define _EXP_ exp
#endif
#if F_PRECISION == 2
struct F_FLOAT2
{
F_FLOAT x;
F_FLOAT y;
};
struct F_FLOAT3
{
F_FLOAT x;
F_FLOAT y;
F_FLOAT z;
};
struct F_FLOAT4
{
F_FLOAT x;
F_FLOAT y;
F_FLOAT z;
F_FLOAT w;
};
#else
#define F_FLOAT2 float2
#define F_FLOAT3 float3
#define F_FLOAT4 float4
#endif
//--------------------------------
//-----------ENERGY-----------------
//--------------------------------
#ifdef ENERGY_PRECISION
#if ENERGY_PRECISION == 1
#define ENERGY_FLOAT float
#define ENERGY_F(x) x##f
#endif
#if ENERGY_PRECISION == 2
#define ENERGY_FLOAT double
#define ENERGY_F(x) x
#endif
#endif
#ifndef ENERGY_PRECISION
#define ENERGY_FLOAT CUDA_FLOAT
#define ENERGY_F(x) CUDA_F(x)
#define ENERGY_PRECISION CUDA_PRECISION
#endif
//--------------------------------
//-----------POSITIONS------------
//--------------------------------
#ifdef X_PRECISION
#if X_PRECISION == 1
#define X_FLOAT float
#define X_F(x) x##f
#endif
#if X_PRECISION == 2
#define X_FLOAT double
#define X_F(x) x
#endif
#endif
#ifndef X_PRECISION
#define X_FLOAT CUDA_FLOAT
#define X_F(x) CUDA_F(x)
#define X_PRECISION CUDA_PRECISION
#endif
#if X_PRECISION == 2
struct X_FLOAT2
{
X_FLOAT x;
X_FLOAT y;
};
struct X_FLOAT3
{
X_FLOAT x;
X_FLOAT y;
X_FLOAT z;
};
struct X_FLOAT4
{
X_FLOAT x;
X_FLOAT y;
X_FLOAT z;
X_FLOAT w;
};
#else
#define X_FLOAT2 float2
#define X_FLOAT3 float3
#define X_FLOAT4 float4
#endif
//--------------------------------
//-----------velocities-----------
//--------------------------------
#ifdef V_PRECISION
#if V_PRECISION == 1
#define V_FLOAT float
#define V_F(x) x##f
#endif
#if V_PRECISION == 2
#define V_FLOAT double
#define V_F(x) x
#endif
#endif
#ifndef V_PRECISION
#define V_FLOAT CUDA_FLOAT
#define V_F(x) CUDA_F(x)
#define V_PRECISION CUDA_PRECISION
#endif
#if V_PRECISION == 2
struct V_FLOAT4
{
V_FLOAT x;
V_FLOAT y;
V_FLOAT z;
V_FLOAT w;
};
#else
#define V_FLOAT4 float4
#endif
#ifdef NO_PREC_TIMING
struct timespec_2
{
unsigned int tv_sec;
unsigned int tv_nsec;
};
#define timespec timespec_2
#define clock_gettime(a,b)
#endif
#endif /*CUDA_PRECISION_H_*/

View File

@ -1,380 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_SHARED_H_
#define _CUDA_SHARED_H_
#include "cuda_precision.h"
#define CUDA_MAX_DEBUG_SIZE 1000 //size of the debugdata array (room for this many doubles, or twice as many ints)
struct dev_array
{
void* dev_data; // pointer to memory address on cuda device
unsigned dim[3]; // array dimensions
};
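//Example (illustrative, not prescribed by this header): a per-atom 3-vector
//field such as x would use dim[0]=nmax and dim[1]=3, with dev_data pointing
//at the corresponding device allocation.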
struct cuda_shared_atom // relevant data from atom class
{
dev_array dx; // cumulated distance for binning settings
dev_array x; // position
dev_array v; // velocity
dev_array f; // force
dev_array tag; // global atom ID number
dev_array type; // atom type; the ghost type equals ntypes (ntypescuda=ntypes+1)
dev_array mask;
dev_array image;
dev_array q; // charges
dev_array mass; // per-type masses
dev_array rmass; // per-atom masses
dev_array radius; // per-atom radius
dev_array density;
dev_array omega;
dev_array torque;
dev_array molecule;
dev_array special;
int maxspecial;
dev_array nspecial;
int* special_flag;
int molecular;
dev_array eatom; // per-atom energy
dev_array vatom; // per-atom virial
int need_eatom;
int need_vatom;
dev_array x_type; // position + type in X_FLOAT4 struct
dev_array v_radius; // velocity + radius in V_FLOAT4 struct, currently only used for granular atom_style
dev_array omega_rmass; // omega + rmass in V_FLOAT4 struct, currently only used for granular atom_style
double* mass_host; // remember per-type host pointer to masses
//int natoms; // total # of atoms in system, could be 0
int nghost; // # of ghost atoms on this proc
int nlocal; // # of owned atoms on this proc
int nall; // total # of owned + ghost atoms on this proc
int nmax; // max # of owned+ghost in arrays on this proc
int ntypes;
int q_flag; // do we have charges?
int rmass_flag; // do we have per-atom masses?
int firstgroup;
int nfirst;
int update_nlocal;
int update_nmax;
int update_neigh;
dev_array xhold; // position at last neighboring
X_FLOAT triggerneighsq; // maximum squared displacement before reneighboring
int reneigh_flag; // is reneighboring necessary
int maxhold; // size of xhold
int dist_check; //perform distance check for reneighboring
dev_array binned_id; //id of each binned atom (not tag!!)
dev_array binned_idnew; //new id of each binned atom for sorting; effectively moves atom[binned_id[k]] to atom[binned_newid[k]]
float bin_extraspace;
int bin_dim[3];
int bin_nmax;
dev_array map_array;
};
struct cuda_shared_pair // relevant data from pair class
{
char cudable_force; // check for (cudable_force!=0)
X_FLOAT cut_global;
X_FLOAT cut_inner_global;
X_FLOAT cut_coul_global;
double** cut; // type-type cutoff
double** cutsq; // type-type cutoff squared
double** cut_inner; // type-type inner cutoff
double** cut_coul; // type-type coulomb cutoff
double** coeff1; // type-type pair parameters
double** coeff2;
double** coeff3;
double** coeff4;
double** coeff5;
double** coeff6;
double** coeff7;
double** coeff8;
double** coeff9;
double** coeff10;
double** offset;
double* special_lj;
double* special_coul;
dev_array virial; // ENERGY_FLOAT
dev_array eng_vdwl; // ENERGY_FLOAT
dev_array eng_coul; // ENERGY_FLOAT
X_FLOAT cut_coulsq_global;
F_FLOAT g_ewald,kappa;
int freeze_group_bit;
dev_array coeff1_gm;
dev_array coeff2_gm;
dev_array coeff3_gm;
dev_array coeff4_gm;
dev_array coeff5_gm;
dev_array coeff6_gm;
dev_array coeff7_gm;
dev_array coeff8_gm;
dev_array coeff9_gm;
dev_array coeff10_gm;
int lastgridsize;
int n_energy_virial;
int collect_forces_later;
int use_block_per_atom;
int override_block_per_atom;
bool neighall;
};
struct cuda_shared_domain // relevant data from domain class
{
X_FLOAT sublo[3]; // orthogonal box -> sub-box bounds on this proc
X_FLOAT subhi[3];
X_FLOAT boxlo[3];
X_FLOAT boxhi[3];
X_FLOAT prd[3];
int periodicity[3]; // xyz periodicity as array
int triclinic;
X_FLOAT xy;
X_FLOAT xz;
X_FLOAT yz;
X_FLOAT boxlo_lamda[3];
X_FLOAT boxhi_lamda[3];
X_FLOAT prd_lamda[3];
X_FLOAT h[6];
X_FLOAT h_inv[6];
V_FLOAT h_rate[6];
int update;
};
struct cuda_shared_pppm
{
char cudable_force;
#ifdef FFT_CUFFT
FFT_FLOAT* work1;
FFT_FLOAT* work2;
FFT_FLOAT* work3;
PPPM_FLOAT* greensfn;
PPPM_FLOAT* fkx;
PPPM_FLOAT* fky;
PPPM_FLOAT* fkz;
PPPM_FLOAT* vg;
#endif
int* part2grid;
PPPM_FLOAT* density_brick;
int* density_brick_int;
PPPM_FLOAT density_intScale;
PPPM_FLOAT* vdx_brick;
PPPM_FLOAT* vdy_brick;
PPPM_FLOAT* vdz_brick;
PPPM_FLOAT* density_fft;
ENERGY_FLOAT* energy;
ENERGY_FLOAT* virial;
int nxlo_in;
int nxhi_in;
int nxlo_out;
int nxhi_out;
int nylo_in;
int nyhi_in;
int nylo_out;
int nyhi_out;
int nzlo_in;
int nzhi_in;
int nzlo_out;
int nzhi_out;
int nx_pppm;
int ny_pppm;
int nz_pppm;
PPPM_FLOAT qqrd2e;
int order;
// float3 sublo;
PPPM_FLOAT* rho_coeff;
int nmax;
int nlocal;
PPPM_FLOAT* debugdata;
PPPM_FLOAT delxinv;
PPPM_FLOAT delyinv;
PPPM_FLOAT delzinv;
int nlower;
int nupper;
PPPM_FLOAT shiftone;
PPPM_FLOAT3* fH;
};
struct cuda_shared_comm
{
int maxswap;
int maxlistlength;
dev_array pbc;
dev_array slablo;
dev_array slabhi;
dev_array multilo;
dev_array multihi;
dev_array sendlist;
int grow_flag;
int comm_phase;
int nsend;
int* nsend_swap;
int* send_size;
int* recv_size;
double** buf_send;
void** buf_send_dev;
double** buf_recv;
void** buf_recv_dev;
void* buffer;
int buffer_size;
double overlap_split_ratio;
};
struct cuda_shared_neighlist // member of CudaNeighList, has no instance in cuda_shared_data
{
int maxlocal;
int inum; // # of I atoms neighbors are stored for
int inum_border2;
dev_array inum_border; // # of atoms which interact with border atoms
dev_array ilist; // local indices of I atoms
dev_array ilist_border;
dev_array numneigh;
dev_array numneigh_inner;
dev_array numneigh_border;
dev_array firstneigh;
dev_array neighbors;
dev_array neighbors_border;
dev_array neighbors_inner;
int maxpage;
dev_array page_pointers;
dev_array* pages;
int maxneighbors;
int neigh_lists_per_page;
double** cutneighsq;
CUDA_FLOAT* cu_cutneighsq;
int* binned_id;
int* bin_dim;
int bin_nmax;
float bin_extraspace;
double maxcut;
dev_array ex_type;
int nex_type;
dev_array ex1_bit;
dev_array ex2_bit;
int nex_group;
dev_array ex_mol_bit;
int nex_mol;
};
struct cuda_compile_settings // used to compare the compile settings (i.e. precision) of the .cu files and the .cpp files
{
int prec_glob;
int prec_x;
int prec_v;
int prec_f;
int prec_pppm;
int prec_fft;
int cufft;
int arch;
};
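//A minimal sketch of the intended consistency check (hypothetical helper, not
//part of this header): fill one instance from the .cu side and one from the
//.cpp side, then abort on mismatch, e.g.
//  if(cu_settings.prec_glob != cpp_settings.prec_glob || cu_settings.cufft != cpp_settings.cufft)
//    { fprintf(stderr, "compile settings of cu and cpp files do not match\n"); exit(1); }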
struct cuda_timings_struct
{
//Debug:
double test1;
double test2;
//transfers
double transfer_upload_tmp_constr;
double transfer_download_tmp_deconstr;
//communication
double comm_forward_total;
double comm_forward_mpi_upper;
double comm_forward_mpi_lower;
double comm_forward_kernel_pack;
double comm_forward_kernel_unpack;
double comm_forward_kernel_self;
double comm_forward_upload;
double comm_forward_download;
double comm_exchange_total;
double comm_exchange_mpi;
double comm_exchange_kernel_pack;
double comm_exchange_kernel_unpack;
double comm_exchange_kernel_fill;
double comm_exchange_cpu_pack;
double comm_exchange_upload;
double comm_exchange_download;
double comm_border_total;
double comm_border_mpi;
double comm_border_kernel_pack;
double comm_border_kernel_unpack;
double comm_border_kernel_self;
double comm_border_kernel_buildlist;
double comm_border_upload;
double comm_border_download;
//pair forces
double pair_xtype_conversion;
double pair_kernel;
double pair_virial;
double pair_force_collection;
//neighbor
double neigh_bin;
double neigh_build;
double neigh_special;
//PPPM
double pppm_particle_map;
double pppm_make_rho;
double pppm_brick2fft;
double pppm_poisson;
double pppm_fillbrick;
double pppm_fieldforce;
double pppm_compute;
};
struct cuda_shared_data // holds space for all relevant data from the different classes
{
void* buffer; //holds temporary GPU data [used within subroutines; it need not stay consistent outside of them]
int buffersize; //maxsize of buffer
int buffer_new; //should be 1 if the pointer to buffer has changed
void* flag;
void* debugdata; //array for easily collecting debug data from the device; class Cuda holds the corresponding cu_debugdata and host array
cuda_shared_atom atom;
cuda_shared_pair pair;
cuda_shared_domain domain;
cuda_shared_pppm pppm;
cuda_shared_comm comm;
cuda_compile_settings compile_settings;
cuda_timings_struct cuda_timings;
int exchange_dim;
int me; //mpi rank
unsigned int datamask;
int overlap_comm;
};
#endif // #ifndef _CUDA_SHARED_H_

View File

@ -1,317 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#include "cuda_shared.h"
#include "cuda_common.h"
#include "cuda_wrapper_cu.h"
#include "cuda_wrapper_kernel.cu"
static int CudaWrapper_total_gpu_mem=0;
static double CudaWrapper_total_upload_time=0;
static double CudaWrapper_total_download_time=0;
static double CudaWrapper_cpubuffer_upload_time=0;
static double CudaWrapper_cpubuffer_download_time=0;
static cudaStream_t* streams;
static int nstreams=0;
void CudaWrapper_Init(int argc, char** argv,int me,int ppn,int* devicelist)
{
MYDBG( printf("# CUDA: debug mode on\n"); )
#if __DEVICE_EMULATION__
printf("# CUDA: emulation mode on\n");
#else
// modified from cutil.h
static int deviceCount=0;
static bool sharedmode=false;
if(deviceCount && !sharedmode) return;
if(deviceCount && sharedmode) cudaThreadExit();
CUDA_SAFE_CALL_NO_SYNC( cudaGetDeviceCount(&deviceCount) );
if (deviceCount == 0)
{
fprintf(stderr, "cutil error: no devices supporting CUDA.\n");
exit(EXIT_FAILURE);
}
MYDBG( printf("# CUDA There are %i devices supporting CUDA in this system.\n",deviceCount);)
cudaDeviceProp deviceProp[deviceCount];
for(int i=0;i<deviceCount;i++)
CUDA_SAFE_CALL_NO_SYNC( cudaGetDeviceProperties(&(deviceProp[i]), i) );
int dev_list[deviceCount];
for(int i=0;i<deviceCount;i++) dev_list[i]=i;
for(int i=0;i<deviceCount;i++)
{
for(int j=0;j<deviceCount-1-i;j++)
if(deviceProp[dev_list[j]].multiProcessorCount<deviceProp[dev_list[j+1]].multiProcessorCount)
{
int k=dev_list[j];
dev_list[j]=dev_list[j+1];
dev_list[j+1]=k;
}
}
for(int i=0;i<deviceCount;i++)
{
if((deviceProp[dev_list[i]].computeMode==0)) sharedmode=true;
cudaSetDevice(i);
cudaSetDeviceFlags(cudaDeviceMapHost);
}
if(sharedmode)
{
if(ppn&&(me%ppn+1)>deviceCount) {printf("Asking for more GPUs per node when there are. Reduce gpu/node setting.\n"); exit(0);}
int devicea=me%ppn;
if(devicelist) devicea=devicelist[devicea];
else
devicea=dev_list[devicea];
if(devicea>=deviceCount) {printf("Asking for non-existent GPU %i. Found only %i GPUs.\n",devicea,deviceCount); exit(0);}
MYDBG(
printf(" # CUDA myid: %i take device: %i\n",me,devicea);
)
CUDA_SAFE_CALL( cudaSetDevice(devicea) );
}
else
{
CUDA_SAFE_CALL( cudaSetValidDevices(dev_list,deviceCount) );
}
cudaThreadSynchronize();
int dev;
CUDA_SAFE_CALL( cudaGetDevice(&dev));
if (deviceProp[dev].major < 1)
{
fprintf(stderr, "CUDA error: device does not support CUDA.\n");
exit(EXIT_FAILURE);
}
else
if ((deviceProp[dev].major == 1)&&(deviceProp[dev].minor != 3))
{
fprintf(stderr, "CUDA error: You need a device with compute capability 1.3 or higher (Device %i is a %s with CC %i.%i)\n",dev,deviceProp[dev].name,deviceProp[dev].major,deviceProp[dev].minor);
exit(EXIT_FAILURE);
}
if ((deviceProp[dev].major == 2)&&(CUDA_ARCH<20))
{
fprintf(stderr, "CUDA warning: You are using a compute %i.%i or higher GPU while LAMMPScuda has been compiled for architecture 1.3\n",deviceProp[dev].major,deviceProp[dev].minor);
}
if ((deviceProp[dev].major == 1)&&(CUDA_ARCH>=20))
{
fprintf(stderr, "CUDA error: You are using a compute 1.3 GPU while LAMMPScuda has been compiled for architecture %i\n",CUDA_ARCH);
exit(EXIT_FAILURE);
}
fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);
MYDBG( fprintf(stderr, "# Using device %d: %s\n", dev, deviceProp[dev].name);)
MYDBG
(
printf("name = %s\n", deviceProp[dev].name);
printf("totalGlobalMem = %u\n", deviceProp[dev].totalGlobalMem);
printf("sharedMemPerBlock = %i\n", deviceProp[dev].sharedMemPerBlock);
printf("regsPerBlock = %i\n", deviceProp[dev].regsPerBlock);
printf("warpSize = %i\n", deviceProp[dev].warpSize);
printf("memPitch = %i\n", deviceProp[dev].memPitch);
printf("maxThreadsPerBlock = %i\n", deviceProp[dev].maxThreadsPerBlock);
printf("maxThreadsDim = [%i, %i, %i]\n", deviceProp[dev].maxThreadsDim[0], deviceProp[dev].maxThreadsDim[1], deviceProp[dev].maxThreadsDim[2]);
printf("maxGridSize = [%i, %i, %i]\n", deviceProp[dev].maxGridSize[0], deviceProp[dev].maxGridSize[1], deviceProp[dev].maxGridSize[2]);
printf("totalConstMem = %i\n", deviceProp[dev].totalConstMem);
printf("major . minor = %i . %i\n", deviceProp[dev].major, deviceProp[dev].minor);
printf("clockRate = %i\n", deviceProp[dev].clockRate);
printf("textureAlignment = %i\n", deviceProp[dev].textureAlignment);
printf("deviceOverlap = %i\n", deviceProp[dev].deviceOverlap);
printf("multiProcessorCount = %i\n", deviceProp[dev].multiProcessorCount);
printf("computeMode = %i\n", deviceProp[dev].computeMode);
)
#endif
}
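/* Typical call (illustrative): one MPI rank per GPU with two ranks per node:
 *   CudaWrapper_Init(argc, argv, me, 2, NULL);
 */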
void* CudaWrapper_AllocCudaData(unsigned nbytes)
{
void* dev_data;
CUDA_SAFE_CALL( cudaMalloc((void**)&dev_data, nbytes) );
MYDBG( printf("# CUDA: allocated %u bytes on device at dev%p\n", nbytes, dev_data); )
CudaWrapper_total_gpu_mem+=nbytes;
return dev_data;
}
void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes)
{
MYDBG( printf("# CUDA: uploading %u bytes to device at dev%p from %p\n", nbytes, dev_data,host_data); )
cudaThreadSynchronize();
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
CUDA_SAFE_CALL( cudaMemcpy(dev_data, host_data, nbytes, cudaMemcpyHostToDevice) );
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_total_upload_time+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
}
void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes,int stream)
{
MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); )
cudaMemcpyAsync(dev_data, host_data, nbytes, cudaMemcpyHostToDevice,streams[stream]);
}
void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes)
{
MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); )
cudaThreadSynchronize();
timespec time1,time2;
clock_gettime(CLOCK_REALTIME,&time1);
CUDA_SAFE_CALL( cudaMemcpy(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost) );
clock_gettime(CLOCK_REALTIME,&time2);
CudaWrapper_total_download_time+=
time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
}
void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes,int stream)
{
MYDBG( printf("# CUDA: downloading %u bytes from device at dev%p\n", nbytes, dev_data); )
cudaMemcpyAsync(host_data, dev_data, nbytes, cudaMemcpyDeviceToHost,streams[stream]);
}
void CudaWrapper_FreeCudaData(void* dev_data,unsigned nbytes)
{
MYDBG( printf("# CUDA: freeing memory at dev%p with %i bytes (last adress: %p)\n", dev_data,nbytes,(char*)dev_data+nbytes); )
CUDA_SAFE_CALL( cudaFree(dev_data) );
CudaWrapper_total_gpu_mem-=nbytes;
}
void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes)
{
MYDBG( printf("# CUDA: setting %u bytes to %i at dev%p\n", nbytes, value, dev_data); )
CUDA_SAFE_CALL( cudaMemset(dev_data, value, nbytes) );
}
void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes)
{
MYDBG( printf("# CUDA: copy %u bytes from dev%p to dev%p\n", nbytes, dev_source,dev_dest); )
CUDA_SAFE_CALL( cudaMemcpy(dev_dest, dev_source, nbytes, cudaMemcpyDeviceToDevice) );
}
void* CudaWrapper_AllocPinnedHostData(unsigned nbytes,bool mapped,bool writeCombined)
{
void* host_data;
int flags=0;
if(mapped) flags=flags | cudaHostAllocMapped;
if(writeCombined) flags=flags | cudaHostAllocWriteCombined;
CUDA_SAFE_CALL( cudaHostAlloc((void**)&host_data, nbytes,flags) );
// CUDA_SAFE_CALL( cudaMallocHost((void**)&host_data, nbytes) );
MYDBG( printf("# CUDA: allocated %u bytes pinned memory on host at %p\n", nbytes, host_data); )
return host_data;
}
void CudaWrapper_FreePinnedHostData(void* host_data)
{
MYDBG( printf("# CUDA: freeing pinned host memory at %p \n",host_data); )
if(host_data)
CUDA_SAFE_CALL( cudaFreeHost(host_data) );
}
void cuda_check_error(char* comment)
{
printf("ERROR-CUDA %s %s\n",comment,cudaGetErrorString(cudaGetLastError()));
}
int CudaWrapper_CheckMemUseage()
{
size_t free,total;
cudaMemGetInfo(&free,&total);
return total-free; //possible with cuda 3.0 ???
//return CudaWrapper_total_gpu_mem;
}
double CudaWrapper_CheckUploadTime(bool reset)
{
if(reset) CudaWrapper_total_upload_time=0.0;
return CudaWrapper_total_upload_time;
}
double CudaWrapper_CheckDownloadTime(bool reset)
{
if(reset) CudaWrapper_total_download_time=0.0;
return CudaWrapper_total_download_time;
}
double CudaWrapper_CheckCPUBufUploadTime(bool reset)
{
if(reset) CudaWrapper_cpubuffer_upload_time=0.0;
return CudaWrapper_cpubuffer_upload_time;
}
double CudaWrapper_CheckCPUBufDownloadTime(bool reset)
{
if(reset) CudaWrapper_cpubuffer_download_time=0.0;
return CudaWrapper_cpubuffer_download_time;
}
void CudaWrapper_AddCPUBufUploadTime(double dt)
{
CudaWrapper_cpubuffer_upload_time+=dt;
}
void CudaWrapper_AddCPUBufDownloadTime(double dt)
{
CudaWrapper_cpubuffer_download_time+=dt;
}
void CudaWrapper_Sync()
{
cudaThreadSynchronize();
}
void CudaWrapper_SyncStream(int stream)
{
cudaStreamSynchronize(streams[stream]);
}
void CudaWrapper_AddStreams(int n)
{
cudaStream_t* new_streams=new cudaStream_t[nstreams+n];
for(int i=0;i<nstreams;i++) new_streams[i]=streams[i];
for(int i=nstreams;i<nstreams+n;i++) cudaStreamCreate(&new_streams[i]);
if(nstreams>0)
delete [] streams;
streams=new_streams;
nstreams+=n;
}
void* CudaWrapper_returnStreams()
{
return (void*) streams;
}
int CudaWrapper_returnNStreams()
{
return nstreams;
}
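/* Illustrative async pattern using the helpers above (assumption, not original code):
 *   CudaWrapper_AddStreams(2);
 *   CudaWrapper_UploadCudaDataAsync(host_buf, dev_buf, nbytes, 0);
 *   CudaWrapper_SyncStream(0);
 */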

View File

@ -1,52 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef _CUDA_DATA_WRAPPER_H_
#define _CUDA_DATA_WRAPPER_H_
extern "C" void CudaWrapper_Init(int argc, char** argv,int me=0,int ppn=2,int* devicelist=NULL);
extern "C" void* CudaWrapper_AllocCudaData(unsigned nbytes);
extern "C" void CudaWrapper_UploadCudaData(void* host_data, void* dev_data, unsigned nbytes);
extern "C" void CudaWrapper_UploadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id);
extern "C" void CudaWrapper_DownloadCudaData(void* host_data, void* dev_data, unsigned nbytes);
extern "C" void CudaWrapper_DownloadCudaDataAsync(void* host_data, void* dev_data, unsigned nbytes, int stream_id);
extern "C" void CudaWrapper_FreeCudaData(void* dev_data, unsigned nbytes=0);
extern "C" void CudaWrapper_Memset(void* dev_data, int value, unsigned nbytes);
extern "C" void CudaWrapper_CopyData(void* dev_dest, void* dev_source, unsigned nbytes);
extern "C" void* CudaWrapper_AllocPinnedHostData(unsigned nbytes, bool mapped=false, bool writeCombind=false);
extern "C" void CudaWrapper_FreePinnedHostData(void* dev_data);
extern "C" void cuda_check_error(char* comment);
extern "C" int CudaWrapper_CheckMemUseage();
extern "C" double CudaWrapper_CheckUploadTime(bool reset=false);
extern "C" double CudaWrapper_CheckDownloadTime(bool reset=false);
extern "C" double CudaWrapper_CheckCPUBufUploadTime(bool reset=false);
extern "C" double CudaWrapper_CheckCPUBufDownloadTime(bool reset=false);
extern "C" void CudaWrapper_AddCPUBufUploadTime(double dt);
extern "C" void CudaWrapper_AddCPUBufDownloadTime(double dt);
extern "C" void CudaWrapper_Sync();
extern "C" void CudaWrapper_SyncStream(int n);
extern "C" void CudaWrapper_AddStreams(int n);
extern "C" void* CudaWrapper_returnStreams();
extern "C" int CudaWrapper_returnNStreams();
#endif // _CUDA_DATA_WRAPPER_H_

View File

@ -1,24 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
// empty file to obey common make rule

View File

@ -1,194 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX domain
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "domain_cu.h"
#include "domain_kernel.cu"
void Cuda_Domain_UpdateBuffer(cuda_shared_data* sdata,int size)
{
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_Domain Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
void Cuda_Domain_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(image) , & sdata->atom.image.dev_data, sizeof(int*) );
}
void Cuda_Domain_UpdateDomain(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(boxlo) , sdata->domain.boxlo , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(boxhi) , sdata->domain.boxhi , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(subhi) , sdata->domain.subhi , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(periodicity) , sdata->domain.periodicity , 3*sizeof(int));
cudaMemcpyToSymbol(MY_CONST(triclinic) , & sdata->domain.triclinic , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(boxlo_lamda) , sdata->domain.boxlo_lamda , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(boxhi_lamda) , sdata->domain.boxhi_lamda , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(prd_lamda) , sdata->domain.prd_lamda , 3*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(h) , sdata->domain.h , 6*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(h_inv) , sdata->domain.h_inv , 6*sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(h_rate) , sdata->domain.h_rate , 6*sizeof(V_FLOAT));
cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag , sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(debugdata) , &sdata->debugdata , sizeof(int*));
}
void Cuda_Domain_Init(cuda_shared_data* sdata)
{
Cuda_Domain_UpdateNmax(sdata);
Cuda_Domain_UpdateDomain(sdata);
}
void Cuda_Domain_PBC(cuda_shared_data* sdata,int deform_remap,int deform_groupbit,double* extent)
{
Cuda_Domain_UpdateNmax(sdata);
//if(sdata->domain.update)
Cuda_Domain_UpdateDomain(sdata);
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int box_change=0;
if(extent) box_change=1;
int sharedmem=0;
if(box_change) sharedmem=6*sizeof(X_FLOAT);
int3 layout=getgrid(sdata->atom.nlocal,sharedmem);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
sharedmem*=threads.x;
if((box_change)&&(sdata->buffer_new or (6*sizeof(X_FLOAT)*grid.x*grid.y>sdata->buffersize)))
Cuda_Domain_UpdateBuffer(sdata,layout.x*layout.y*6*sizeof(X_FLOAT));
Domain_PBC_Kernel<<<grid, threads,sharedmem>>>(deform_remap,deform_groupbit,box_change);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Domain_PBC: Kernel execution failed");
if(box_change)
{
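//_buffer now holds six consecutive blocks of layout.x*layout.y values each:
//the per-block min and max of x, y and z; finish the reduction on the host.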
X_FLOAT buf2[6*layout.x*layout.y];
X_FLOAT* buf=buf2;
int flag;
cudaMemcpy(buf, sdata->buffer, 6*layout.x*layout.y*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
cudaMemcpy(&flag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
//printf("Flag: %i\n",flag);
X_FLOAT min,max;
min=1.0*BIG;
max=-1.0*BIG;
for(int i=0;i<layout.x*layout.y;i++)
{
if(buf[i]<min) min=buf[i];
if(buf[i+layout.x*layout.y]>max) max=buf[i+layout.x*layout.y];
}
extent[0]=min;
extent[1]=max;
buf+=2*layout.x*layout.y;
min=1.0*BIG;
max=-1.0*BIG;
for(int i=0;i<layout.x*layout.y;i++)
{
if(buf[i]<min) min=buf[i];
if(buf[i+layout.x*layout.y]>max) max=buf[i+layout.x*layout.y];
}
extent[2]=min;
extent[3]=max;
buf+=2*layout.x*layout.y;
min=1.0*BIG;
max=-1.0*BIG;
for(int i=0;i<layout.x*layout.y;i++)
{
if(buf[i]<min) min=buf[i];
if(buf[i+layout.x*layout.y]>max) max=buf[i+layout.x*layout.y];
}
extent[4]=min;
extent[5]=max;
//printf("Extent: %lf %lf %lf %lf %lf %lf\n",extent[0],extent[1],extent[2],extent[3],extent[4],extent[5]);
/* int n=grid.x*grid.y;
if(n<128) threads.x=32;
else if(n<256) threads.x=64;
else threads.x=128;
sharedmem=n*sizeof(X_FLOAT);
grid.x=6;
grid.y=1;
Domain_reduceBoxExtent<<<grid, threads,sharedmem>>>(extent,n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Domain_reduceBoxExtent: Kernel execution failed");*/
}
}
void Cuda_Domain_lamda2x(cuda_shared_data* sdata,int n)
{
Cuda_Domain_UpdateNmax(sdata);
//if(sdata->domain.update)
Cuda_Domain_UpdateDomain(sdata);
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Domain_lamda2x_Kernel<<<grid, threads,0>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Domain_lamda2x: Kernel execution failed");
}
void Cuda_Domain_x2lamda(cuda_shared_data* sdata,int n)
{
Cuda_Domain_UpdateNmax(sdata);
//if(sdata->domain.update)
Cuda_Domain_UpdateDomain(sdata);
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Domain_x2lamda_Kernel<<<grid, threads,0>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Domain_x2lamda: Kernel execution failed");
}

View File

@ -1,29 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_Domain_Init(cuda_shared_data* sdata);
extern "C" void Cuda_Domain_PBC(cuda_shared_data* sdata,int deform_remap,int deform_groupbit,double* extent=NULL);
extern "C" void Cuda_Domain_lamda2x(cuda_shared_data* sdata,int n);
extern "C" void Cuda_Domain_x2lamda(cuda_shared_data* sdata,int n);

View File

@ -1,269 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ X_FLOAT sharedmem[];
#define BIG 1e10
__global__ void Domain_PBC_Kernel(int deform_remap,int deform_groupbit,int box_change)
{
int idim,otherdims;
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
X_FLOAT lo[3];
X_FLOAT hi[3];
X_FLOAT* period;
if (_triclinic == 0) {
lo[0] = _boxlo[0];
lo[1] = _boxlo[1];
lo[2] = _boxlo[2];
hi[0] = _boxhi[0];
hi[1] = _boxhi[1];
hi[2] = _boxhi[2];
period = _prd;
} else {
lo[0] = _boxlo_lamda[0];
lo[1] = _boxlo_lamda[1];
lo[2] = _boxlo_lamda[2];
hi[0] = _boxhi_lamda[0];
hi[1] = _boxhi_lamda[1];
hi[2] = _boxhi_lamda[2];
period = _prd_lamda;
}
X_FLOAT tmpx=X_F(0.5)*(hi[0]+lo[0]);
X_FLOAT tmpy=X_F(0.5)*(hi[1]+lo[1]);
X_FLOAT tmpz=X_F(0.5)*(hi[2]+lo[2]);
X_FLOAT* buf=(X_FLOAT*) _buffer;
buf+=blockIdx.x*gridDim.y+blockIdx.y;
buf[0]=tmpx;
buf+=gridDim.x*gridDim.y;
buf[0]=tmpx;
buf+=gridDim.x*gridDim.y;
buf[0]=tmpy;
buf+=gridDim.x*gridDim.y;
buf[0]=tmpy;
buf+=gridDim.x*gridDim.y;
buf[0]=tmpz;
buf+=gridDim.x*gridDim.y;
buf[0]=tmpz;
if(i<_nlocal)
{
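//Each periodic wrap below also updates the matching 10-bit image counter
//packed into _image[i] (bits 0-9: x, bits 10-19: y, bits 20-29: z), as in
//the CPU implementation of Domain::pbc().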
if (_periodicity[0]) {
if (_x[i] < lo[0]) {
_x[i] += period[0];
if (deform_remap && _mask[i] & deform_groupbit) _v[i] += _h_rate[0];
idim = _image[i] & 1023;
otherdims = _image[i] ^ idim;
idim--;
idim &= 1023;
_image[i] = otherdims | idim;
}
if (_x[i] >= hi[0]) {
_x[i] -= period[0];
_x[i] = MAX(_x[i],lo[0]);
if (deform_remap && _mask[i] & deform_groupbit) _v[i] -= _h_rate[0];
idim = _image[i] & 1023;
otherdims = _image[i] ^ idim;
idim++;
idim &= 1023;
_image[i] = otherdims | idim;
}
}
if (_periodicity[1]) {
if (_x[i+_nmax] < lo[1]) {
_x[i+_nmax] += period[1];
if (deform_remap && _mask[i] & deform_groupbit) {
_v[i] += _h_rate[5];
_v[i+_nmax] += _h_rate[1];
}
idim = (_image[i] >> 10) & 1023;
otherdims = _image[i] ^ (idim << 10);
idim--;
idim &= 1023;
_image[i] = otherdims | (idim << 10);
}
if (_x[i+_nmax] >= hi[1]) {
_x[i+_nmax] -= period[1];
_x[i+_nmax] = MAX(_x[i+_nmax],lo[1]);
if (deform_remap && _mask[i] & deform_groupbit) {
_v[i] -= _h_rate[5];
_v[i+_nmax] -= _h_rate[1];
}
idim = (_image[i] >> 10) & 1023;
otherdims = _image[i] ^ (idim << 10);
idim++;
idim &= 1023;
_image[i] = otherdims | (idim << 10);
}
}
if (_periodicity[2]) {
if (_x[i+2*_nmax] < lo[2]) {
_x[i+2*_nmax] += period[2];
if (deform_remap && _mask[i] & deform_groupbit) {
_v[i] += _h_rate[4];
_v[i+_nmax] += _h_rate[3];
_v[i+2*_nmax] += _h_rate[2];
}
idim = _image[i] >> 20;
otherdims = _image[i] ^ (idim << 20);
idim--;
idim &= 1023;
_image[i] = otherdims | (idim << 20);
}
if (_x[i+2*_nmax] >= hi[2]) {
_x[i+2*_nmax] -= period[2];
_x[i+2*_nmax] = MAX(_x[i+2*_nmax],lo[2]);
if (deform_remap && _mask[i] & deform_groupbit) {
_v[i] -= _h_rate[4];
_v[i+_nmax] -= _h_rate[3];
_v[i+2*_nmax] -= _h_rate[2];
}
idim = _image[i] >> 20;
otherdims = _image[i] ^ (idim << 20);
idim++;
idim &= 1023;
_image[i] = otherdims | (idim << 20);
}
}
if(box_change)
{
tmpx=_x[i];
tmpy=_x[i+_nmax];
tmpz=_x[i+2*_nmax];
}
}
__syncthreads();
if(box_change)
{
X_FLOAT minx=BIG;
X_FLOAT maxx=-BIG;
X_FLOAT miny=BIG;
X_FLOAT maxy=-BIG;
X_FLOAT minz=BIG;
X_FLOAT maxz=-BIG;
if (not _periodicity[0]) {
sharedmem[threadIdx.x]=tmpx;
minOfBlock(sharedmem);
minx=sharedmem[0];
__syncthreads();
sharedmem[threadIdx.x]=tmpx;
maxOfBlock(sharedmem);
maxx=sharedmem[0];
__syncthreads();
}
else {minx=lo[0];maxx=hi[0];}
if (not _periodicity[1]) {
sharedmem[threadIdx.x]=tmpy;
minOfBlock(sharedmem);
miny=sharedmem[0];
__syncthreads();
sharedmem[threadIdx.x]=tmpy;
maxOfBlock(sharedmem);
maxy=sharedmem[0];
__syncthreads();
}
else {miny=lo[1];maxy=hi[1];}
if (not _periodicity[2]) {
sharedmem[threadIdx.x]=tmpz;
minOfBlock(sharedmem);
minz=sharedmem[0];
__syncthreads();
sharedmem[threadIdx.x]=tmpz;
maxOfBlock(sharedmem);
maxz=sharedmem[0];
__syncthreads();
}
else {minz=lo[2];maxz=hi[2];}
if(threadIdx.x==0)
{
buf=(X_FLOAT*) _buffer;
buf+=blockIdx.x*gridDim.y+blockIdx.y;
buf[0]=minx;
buf+=gridDim.x*gridDim.y;
buf[0]=maxx;
buf+=gridDim.x*gridDim.y;
buf[0]=miny;
buf+=gridDim.x*gridDim.y;
buf[0]=maxy;
buf+=gridDim.x*gridDim.y;
buf[0]=minz;
buf+=gridDim.x*gridDim.y;
buf[0]=maxz;
}
}
}
__global__ void Domain_reduceBoxExtent(double* extent,int n)
{
X_FLOAT* buf=(X_FLOAT*) _buffer;
buf+=blockIdx.x*n;
copyGlobToShared(buf,sharedmem,n);
if(blockIdx.x%2==0)
minOfData(sharedmem,n);
else
maxOfData(sharedmem,n);
extent[blockIdx.x]=sharedmem[0];
}
__global__ void Domain_lamda2x_Kernel(int n)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
X_FLOAT ytmp = _x[i+_nmax];
X_FLOAT ztmp = _x[i+2*_nmax];
_x[i] = _h[0]*_x[i] + _h[5]*ytmp + _h[4]*ztmp + _boxlo[0];
_x[i+_nmax] = _h[1]*ytmp + _h[3]*ztmp + _boxlo[1];
_x[i+2*_nmax] = _h[2]*ztmp + _boxlo[2];
}
}
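//These conversion kernels use the LAMMPS box matrix stored as a 6-vector
//h = (xx,yy,zz,yz,xz,xy): lamda2x computes x = h*lamda + boxlo, and
//Domain_x2lamda_Kernel below applies the inverse via _h_inv.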
__global__ void Domain_x2lamda_Kernel(int n)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
X_FLOAT delta[3];
if(i<n)
{
delta[0] = _x[i] - _boxlo[0];
delta[1] = _x[i+_nmax] - _boxlo[1];
delta[2] = _x[i+2*_nmax] - _boxlo[2];
_x[i] = _h_inv[0]*delta[0] + _h_inv[5]*delta[1] + _h_inv[4]*delta[2];
_x[i+_nmax] = _h_inv[1]*delta[1] + _h_inv[3]*delta[2];
_x[i+2*_nmax] = _h_inv[2]*delta[2];
}
}

View File

@ -1,103 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
//#define CUDA_PRECISION 1
#include "cuda_precision.h"
#include "cuda_common.h"
struct FFT_DATA{
FFT_FLOAT re;
FFT_FLOAT im;
};
#include "fft3d_cuda_cu.h"
#include "fft3d_cuda_kernel.cu"
#include <stdio.h>
void initfftdata(double* in,FFT_FLOAT* out,int nfast,int nmid,int nslow)
{
dim3 grid;
grid.x=nslow;
grid.y=nmid;
grid.z=1;
dim3 threads;
threads.x=nfast;
threads.y=1;
threads.z=1;
cudaThreadSynchronize();
initfftdata_kernel<<<grid,threads,0>>>(in,out);
cudaThreadSynchronize();
MYDBG(printf("ERROR-CUDA initfftdata_kernel: %s\n",cudaGetErrorString(cudaGetLastError())));
}
void permute(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow)
{
dim3 grid;
grid.x=nslow;
grid.y=nmid;
grid.z=1;
dim3 threads;
threads.x=nfast*2;
threads.y=1;
threads.z=1;
permute_kernel<<<grid,threads,0>>>((FFT_FLOAT*)in,(FFT_FLOAT*)out);
cudaThreadSynchronize();
MYDBG(printf("ERROR-CUDA permute_kernel: %s\n",cudaGetErrorString(cudaGetLastError())));
}
void permute_scale(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow)
{
dim3 grid;
grid.x=nslow;
grid.y=nmid;
grid.z=1;
dim3 threads;
threads.x=nfast*2;
threads.y=1;
threads.z=1;
permute_kernel<<<grid,threads,0>>>((FFT_FLOAT*)in,(FFT_FLOAT*)out);
cudaThreadSynchronize();
}
void permute_part(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo)
{
dim3 grid;
grid.x=(ihi-ilo+1);
grid.y=(jhi-jlo+1);
grid.z=1;
dim3 threads;
threads.x=(khi-klo+1)*2;
threads.y=1;
threads.z=1;
permute_part_kernel<<<grid,threads,0>>>((FFT_FLOAT*)in,(FFT_FLOAT*)out,nfast,nmid,nslow,ihi,ilo,jhi,jlo,khi,klo);
cudaThreadSynchronize();
}
void FFTsyncthreads()
{
cudaThreadSynchronize();
}

View File

@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void initfftdata(double* in,FFT_FLOAT* out,int nfast,int nmid,int nslow);
extern "C" void permute(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow);
extern "C" void permute_scale(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow);
extern "C" void permute_part(FFT_DATA* in,FFT_DATA* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo);
extern "C" void FFTsyncthreads();

View File

@ -1,44 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void initfftdata_kernel(double* in,FFT_FLOAT* out)
{
out[2*(((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x];
out[2*(((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x)+1]=0;
}
__global__ void permute_kernel(FFT_FLOAT* in,FFT_FLOAT* out)
{
out[2*(((threadIdx.x/2)*gridDim.x+blockIdx.x)*gridDim.y+blockIdx.y)+threadIdx.x-2*(threadIdx.x/2)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x];
}
__global__ void permute_scale_kernel(FFT_FLOAT* in,FFT_FLOAT* out)
{
out[2*(((threadIdx.x/2)*gridDim.x+blockIdx.x)*gridDim.y+blockIdx.y)+threadIdx.x-2*(threadIdx.x/2)]=in[((blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x)+threadIdx.x]*gridDim.x*gridDim.y*blockDim.x*0.5;
}
__global__ void permute_part_kernel(FFT_FLOAT* in,FFT_FLOAT* out,int nfast,int nmid,int nslow,int ihi,int ilo,int jhi,int jlo,int khi,int klo)
{
{out[2*((threadIdx.x/2)*(ihi-ilo+1)*(jhi-jlo+1)+(blockIdx.x)*(jhi-jlo+1)+blockIdx.y-jlo)+threadIdx.x-2*(threadIdx.x/2)]=in[2*(blockIdx.x+ilo)*nmid*nslow+2*(blockIdx.y+jlo)*nmid+threadIdx.x+2*klo]; }
}

View File

@ -1,89 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_add_force_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_addforce_cuda_cu.h"
#include "fix_addforce_cuda_kernel.cu"
void Cuda_FixAddForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size=(unsigned)(layout.z*layout.y*layout.x)*4*sizeof(F_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixAddForceCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
void Cuda_FixAddForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
}
void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixAddForceCuda_UpdateNmax(sdata);
}
void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue,F_FLOAT* aforiginal)
{
if(sdata->atom.update_nmax)
Cuda_FixAddForceCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new)
Cuda_FixAddForceCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixAddForceCuda_PostForce_Kernel<<<grid, threads,threads.x*4*sizeof(F_FLOAT)>>> (groupbit,axvalue,ayvalue,azvalue);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Compute Kernel execution failed");
int oldgrid=grid.x*grid.y;
grid.x=4;
grid.y=1;
threads.x=512;
reduce_foriginal<<<grid, threads,threads.x*sizeof(F_FLOAT)>>> (oldgrid,aforiginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAddForceCuda_PostForce: fix add_force post_force Reduce Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixAddForceCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixAddForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue,F_FLOAT* aforiginal);

View File

@ -1,86 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_FLOAT sharedmem[];
__global__ void Cuda_FixAddForceCuda_PostForce_Kernel(int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue)
{
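//sharedmem collects, per thread: the potential energy of the added force
//(-xvalue*x - yvalue*y - zvalue*z) and the original force components before
//the constant force is added; per-block sums go to _buffer for reduce_foriginal.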
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
sharedmem[threadIdx.x+blockDim.x]=0;
sharedmem[threadIdx.x+2*blockDim.x]=0;
sharedmem[threadIdx.x+3*blockDim.x]=0;
if(i < _nlocal)
if (_mask[i] & groupbit)
//if (iregion >= 0 &&
//match(x[i][0],x[i][1],x[i][2],iregion)) //currently not supported
{
sharedmem[threadIdx.x]=-xvalue*_x[i] - yvalue*_x[i+1*_nmax] - zvalue*_x[i+2*_nmax];
sharedmem[threadIdx.x+blockDim.x]=_f[i];
sharedmem[threadIdx.x+2*blockDim.x]=_f[i+1*_nmax];
sharedmem[threadIdx.x+3*blockDim.x]=_f[i+2*_nmax];
_f[i] += xvalue;
_f[i+1*_nmax] += yvalue;
_f[i+2*_nmax] += zvalue;
}
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2*blockDim.x]);
reduceBlock(&sharedmem[3*blockDim.x]);
F_FLOAT* buffer=(F_FLOAT*) _buffer;
if(threadIdx.x==0)
{
buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
}
}
__global__ void reduce_foriginal(int n,F_FLOAT* foriginal)
{
int i=0;
sharedmem[threadIdx.x]=0;
F_FLOAT myforig=0.0;
F_FLOAT* buf=(F_FLOAT*) _buffer;
buf=&buf[blockIdx.x*n];
while(i<n)
{
sharedmem[threadIdx.x]=0;
if(i+threadIdx.x<n)
sharedmem[threadIdx.x]=buf[i+threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i+=blockDim.x;
if(threadIdx.x==0)
myforig+=sharedmem[0];
}
if(threadIdx.x==0)
foriginal[blockIdx.x]=myforig;
}

View File

@ -1,104 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_ave_force_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_aveforce_cuda_cu.h"
#include "fix_aveforce_cuda_kernel.cu"
void Cuda_FixAveForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size=(unsigned)(layout.z*layout.y*layout.x)*4*sizeof(F_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixAveForceCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
}
void Cuda_FixAveForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
}
void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixAveForceCuda_UpdateNmax(sdata);
}
void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit,F_FLOAT* aforiginal)
{
if(sdata->atom.update_nmax)
Cuda_FixAveForceCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new)
Cuda_FixAveForceCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal,4*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixAveForceCuda_PostForce_FOrg_Kernel<<<grid, threads,threads.x*4*sizeof(F_FLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Compute Kernel execution failed");
int oldgrid=grid.x*grid.y;
grid.x=4;
grid.y=1;
threads.x=512;
Cuda_FixAveForceCuda_reduce_foriginal<<<grid, threads,threads.x*sizeof(F_FLOAT)>>> (oldgrid,aforiginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce: fix ave_force post_force Reduce Kernel execution failed");
}
void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue)
{
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixAveForceCuda_PostForce_Set_Kernel<<<grid, threads,0>>> (groupbit,xflag,yflag,zflag,axvalue,ayvalue,azvalue);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixAveForceCuda_PostForce_Set: fix ave_force post_force Compute Kernel execution failed");
}
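
Every launcher in these files leans on a getgrid helper from crm_cuda_utils.cu, which this diff does not include. Its call sites fix the contract: layout.z becomes the block size, layout.x and layout.y the 2D grid, and the optional second argument is the per-thread shared memory that presumably caps the block size. A hypothetical reconstruction under those assumptions:

// Hypothetical sketch of getgrid (the real one lives in crm_cuda_utils.cu).
// The 2D grid exists because compute-capability-1.x devices limit each grid
// dimension to 65535 blocks; the kernels' i < _nlocal guard absorbs overshoot.
static int3 getgrid_sketch(int n, int threadsPerBlock = 256)
{
    int3 layout;
    layout.z = threadsPerBlock;                  // threads per block
    int blocks = (n + layout.z - 1) / layout.z;  // ceil(n / block size)
    layout.x = blocks;
    layout.y = 1;
    while (layout.x > 65535) {                   // split so layout.x stays legal,
        layout.x = (layout.x + 1) / 2;           // keeping layout.x*layout.y >= blocks
        layout.y *= 2;
    }
    return layout;
}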

View File

@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixAveForceCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixAveForceCuda_PostForce_FOrg(cuda_shared_data* sdata, int groupbit,F_FLOAT* aforiginal);
extern "C" void Cuda_FixAveForceCuda_PostForce_Set(cuda_shared_data* sdata, int groupbit,int xflag,int yflag,int zflag,F_FLOAT axvalue,F_FLOAT ayvalue,F_FLOAT azvalue);

View File

@ -1,87 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_FLOAT sharedmem[];
__global__ void Cuda_FixAveForceCuda_PostForce_FOrg_Kernel(int groupbit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
sharedmem[threadIdx.x+blockDim.x]=0;
sharedmem[threadIdx.x+2*blockDim.x]=0;
sharedmem[threadIdx.x+3*blockDim.x]=0;
if(i < _nlocal)
if (_mask[i] & groupbit) {
sharedmem[threadIdx.x]=_f[i];
sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax];
sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax];
sharedmem[threadIdx.x+3*blockDim.x]=1;
}
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2*blockDim.x]);
reduceBlock(&sharedmem[3*blockDim.x]);
F_FLOAT* buffer=(F_FLOAT*) _buffer;
if(threadIdx.x==0)
{
buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+3*gridDim.x*gridDim.y]=sharedmem[3*blockDim.x];
}
}
__global__ void Cuda_FixAveForceCuda_reduce_foriginal(int n,F_FLOAT* foriginal)
{
int i=0;
sharedmem[threadIdx.x]=0;
F_FLOAT myforig=0.0;
F_FLOAT* buf=(F_FLOAT*) _buffer;
buf=&buf[blockIdx.x*n];
while(i<n)
{
sharedmem[threadIdx.x]=0;
if(i+threadIdx.x<n)
sharedmem[threadIdx.x]=buf[i+threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i+=blockDim.x;
if(threadIdx.x==0)
myforig+=sharedmem[0];
}
if(threadIdx.x==0)
foriginal[blockIdx.x]=myforig;
}
__global__ void Cuda_FixAveForceCuda_PostForce_Set_Kernel(int groupbit,int xflag, int yflag, int zflag,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
if(xflag) _f[i] = xvalue;
if(yflag) _f[i+1*_nmax] = yvalue;
if(zflag) _f[i+2*_nmax] = zvalue;
}
}
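
The other assumed helper is reduceBlock, also from crm_cuda_utils.cu: it sums a shared-memory array of blockDim.x values into slot 0. A minimal stand-alone sketch of that tree reduction, using plain float where the real code uses F_FLOAT and assuming the power-of-two block sizes these launches use:

// Block-wide shared-memory sum: after the call, shared[0] holds the sum of
// shared[0..blockDim.x-1]; the other slots are clobbered.
__device__ void reduceBlock_sketch(float* shared)
{
    __syncthreads();                             // publish each thread's own store
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (threadIdx.x < s)
            shared[threadIdx.x] += shared[threadIdx.x + s];
        __syncthreads();                         // finish this level before the next
    }
}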

View File

@ -1,54 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_enforce2d_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_enforce2d_cuda_cu.h"
#include "fix_enforce2d_cuda_kernel.cu"
void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
}
void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit)
{
if(sdata->atom.update_nmax)
Cuda_FixEnforce2dCuda_Init(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixEnforce2dCuda_PostForce_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixEnforce2dCuda_PostForce: fix enforce2d post_force Kernel execution failed");
}
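
Cuda_FixEnforce2dCuda_Init shows the binding pattern all of these files share: device pointers are copied once into __constant__ symbols (MY_CONST name-mangles them per MY_PREFIX), so kernels dereference them without taking extra arguments. A self-contained sketch of the same mechanism, with hypothetical names:

// Bind a device pointer into constant memory, then use it from a kernel.
__constant__ float* _v_sketch;                   // stands in for MY_CONST(v)

__global__ void zero_z(int nlocal, int nmax)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < nlocal)
        _v_sketch[i + 2 * nmax] = 0.0f;          // z component in the SoA layout
}

int main()
{
    const int nmax = 1024, nlocal = 1000;
    float* v_dev;
    cudaMalloc(&v_dev, 3 * nmax * sizeof(float));
    cudaMemset(v_dev, 0, 3 * nmax * sizeof(float));
    // One-time bind, as Cuda_FixEnforce2dCuda_Init does for _v, _f, _mask:
    cudaMemcpyToSymbol(_v_sketch, &v_dev, sizeof(float*));
    zero_z<<<(nlocal + 255) / 256, 256>>>(nlocal, nmax);
    cudaDeviceSynchronize();
    cudaFree(v_dev);
    return 0;
}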

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixEnforce2dCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixEnforce2dCuda_PostForce(cuda_shared_data* sdata, int groupbit);

View File

@ -1,33 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixEnforce2dCuda_PostForce_Kernel(int groupbit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
_v[i+2*_nmax] = V_F(0.0);
_f[i+2*_nmax] = F_F(0.0);
}
}
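
The i + dim*_nmax indexing here, as in every kernel above, is a structure-of-arrays layout: the x, y and z components each occupy a contiguous block of _nmax entries, so consecutive threads read consecutive addresses and the loads coalesce. A hypothetical helper that spells the arithmetic out:

// Component dim (0=x, 1=y, 2=z) of atom i in a 3*nmax structure-of-arrays
// array; the deleted kernels inline this expression directly.
__device__ __forceinline__ float& soa_elem(float* base, int i, int dim, int nmax)
{
    return base[i + dim * nmax];
}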

View File

@ -1,95 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_freeze_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_freeze_cuda_cu.h"
#include "fix_freeze_cuda_kernel.cu"
void Cuda_FixFreezeCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixFreezeCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) );
}
void Cuda_FixFreezeCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(torque) , & sdata->atom.torque .dev_data, sizeof(F_FLOAT*) );
}
void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixFreezeCuda_UpdateNmax(sdata);
}
void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT* foriginal)
{
if(sdata->atom.update_nmax)
Cuda_FixFreezeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new)
Cuda_FixFreezeCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixFreezeCuda_PostForce_Kernel<<<grid, threads,threads.x*3*sizeof(F_FLOAT)>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force compute Kernel execution failed");
int oldgrid=grid.x;
grid.x=3;
threads.x=512;
Cuda_FixFreezeCuda_Reduce_FOriginal<<<grid, threads,threads.x*sizeof(F_FLOAT)>>> (oldgrid,foriginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixFreezeCuda_PostForce: fix add_force post_force reduce Kernel execution failed");
}
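
All the *_UpdateBuffer functions implement one grow-only policy: reallocate the shared scratch buffer only when a request exceeds the current capacity, and bump buffer_new so dependent modules re-bind the device pointer. The policy in isolation, with the CudaWrapper_* calls reduced to the plain runtime API:

// Grow-only device scratch buffer, sketching the policy above.
struct ScratchBuffer {
    void* ptr;
    int   size;
    int   generation;              // plays the role of sdata->buffer_new
};

static void ensure_capacity(ScratchBuffer& b, int needed)
{
    if (b.size >= needed) return;  // still large enough: keep the allocation
    cudaFree(b.ptr);               // CudaWrapper_FreeCudaData in the original
    cudaMalloc(&b.ptr, needed);    // CudaWrapper_AllocCudaData
    b.size = needed;
    b.generation++;                // tell dependents the pointer changed
}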

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixFreezeCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixFreezeCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT* foriginal);

View File

@ -1,82 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_FLOAT sharedmem[];
__global__ void Cuda_FixFreezeCuda_PostForce_Kernel(int groupbit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
sharedmem[threadIdx.x+blockDim.x]=0;
sharedmem[threadIdx.x+2*blockDim.x]=0;
if(i < _nlocal)
if (_mask[i] & groupbit) {
sharedmem[threadIdx.x]=_f[i];
sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax];
sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax];
_f[i] = F_F(0.0);
_f[i+1*_nmax] = F_F(0.0);
_f[i+2*_nmax] = F_F(0.0);
_torque[i] = F_F(0.0);
_torque[i+1*_nmax] = F_F(0.0);
_torque[i+2*_nmax] = F_F(0.0);
}
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2*blockDim.x]);
F_FLOAT* buffer=(F_FLOAT*)_buffer;
if(threadIdx.x==0)
{
buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
}
}
__global__ void Cuda_FixFreezeCuda_Reduce_FOriginal(int n,F_FLOAT* foriginal)
{
int i=0;
sharedmem[threadIdx.x]=0;
F_FLOAT myforig=0.0;
F_FLOAT* buf=(F_FLOAT*)_buffer;
buf=&buf[blockIdx.x*n];
while(i<n)
{
sharedmem[threadIdx.x]=0;
if(i+threadIdx.x<n)
sharedmem[threadIdx.x]=buf[i+threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i+=blockDim.x;
if(threadIdx.x==0)
myforig+=sharedmem[0];
}
if(threadIdx.x==0)
foriginal[blockIdx.x]=myforig;
}

View File

@ -1,89 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_gravity_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_gravity_cuda_cu.h"
#include "fix_gravity_cuda_kernel.cu"
void Cuda_FixGravityCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixGravityCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) );
}
void Cuda_FixGravityCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(rmass_flag) , & sdata->atom.rmass_flag, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass .dev_data, sizeof(V_FLOAT*) );
}
void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixGravityCuda_UpdateNmax(sdata);
}
void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc)
{
if(sdata->atom.update_nmax)
Cuda_FixGravityCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new)
Cuda_FixGravityCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixGravityCuda_PostForce_Kernel<<<grid, threads>>> (groupbit,xacc,yacc,zacc);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixGravityCuda_PostForce: fix add_force post_force compute Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixGravityCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixGravityCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc);

View File

@ -1,36 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixGravityCuda_PostForce_Kernel(int groupbit,F_FLOAT xacc,F_FLOAT yacc,F_FLOAT zacc)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
F_FLOAT mass = _rmass_flag?_rmass[i]:_mass[_type[i]];
_f[i] += mass*xacc;
_f[i+1*_nmax] += mass*yacc;
_f[i+2*_nmax] += mass*zacc;
}
}
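
The kernel matches the host-side fix gravity post_force loop: per-atom mass (rmass, provided by granular and similar atom styles) takes precedence over the per-type mass table. A CPU restatement for reference:

// CPU reference for the kernel above (forces stored per atom here).
void gravity_post_force_cpu(int nlocal, const int* mask, int groupbit,
                            int rmass_flag, const double* rmass,
                            const double* mass, const int* type,
                            double (*f)[3],
                            double xacc, double yacc, double zacc)
{
    for (int i = 0; i < nlocal; i++)
        if (mask[i] & groupbit) {
            double m = rmass_flag ? rmass[i] : mass[type[i]];
            f[i][0] += m * xacc;
            f[i][1] += m * yacc;
            f[i][2] += m * zacc;
        }
}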

View File

@ -1,220 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_nh_cuda
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_nh_cuda_cu.h"
#include "fix_nh_cuda_kernel.cu"
void Cuda_FixNHCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass.dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*) ); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_CONST(maxhold) , & sdata->atom.maxhold, sizeof(int) ); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata
}
void Cuda_FixNHCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size=(unsigned)10*sizeof(int);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixNHCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata
}
void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
{
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(dtf) , & dtf , sizeof(V_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(dtv) , & dtv , sizeof(X_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(dist_check), & sdata->atom.dist_check , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag , sizeof(int) ); //
Cuda_FixNHCuda_UpdateNmax(sdata);
}
void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h,int mynlocal,int p_triclinic)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
timespec atime1,atime2;
clock_gettime(CLOCK_REALTIME,&atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
clock_gettime(CLOCK_REALTIME,&atime2);
sdata->cuda_timings.test1+=
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
F_FLOAT3 factor = {factor_h[0],factor_h[1],factor_h[2]};
F_FLOAT3 factor2 = {0.0, 0.0, 0.0};  // zeroed so the orthogonal path passes defined values
if(p_triclinic) {factor2.x=factor_h[3]; factor2.y=factor_h[4]; factor2.z=factor_h[5];}
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nh_v_press_Kernel<<<grid, threads>>> (groupbit,factor,p_triclinic,factor2);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}
void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h,int mynlocal,int p_triclinic)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
F_FLOAT3 factor = {factor_h[0],factor_h[1],factor_h[2]};
F_FLOAT3 factor2 = {0.0, 0.0, 0.0};  // zeroed so the orthogonal path passes defined values
if(p_triclinic) {factor2.x=factor_h[3]; factor2.y=factor_h[4]; factor2.z=factor_h[5];}
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
CUT_CHECK_ERROR("FixNHCuda: fix nh v_press pre Kernel execution failed");
FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel<<<grid, threads>>> (groupbit,factor,p_triclinic,factor2);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: fix nh v_press Kernel execution failed");
}
void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta,int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
timespec atime1,atime2;
clock_gettime(CLOCK_REALTIME,&atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
clock_gettime(CLOCK_REALTIME,&atime2);
sdata->cuda_timings.test1+=
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nh_v_temp_Kernel<<<grid, threads>>> (groupbit,factor_eta);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: fix nh v_temp Kernel execution failed");
}
void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit,int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
timespec atime1,atime2;
clock_gettime(CLOCK_REALTIME,&atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
clock_gettime(CLOCK_REALTIME,&atime2);
sdata->cuda_timings.test1+=
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nve_v_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda: nve_v Kernel execution failed");
}
void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit,int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
timespec atime1,atime2;
clock_gettime(CLOCK_REALTIME,&atime1);
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
clock_gettime(CLOCK_REALTIME,&atime2);
sdata->cuda_timings.test1+=
atime2.tv_sec-atime1.tv_sec+1.0*(atime2.tv_nsec-atime1.tv_nsec)/1000000000;
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
cudaMemset(sdata->buffer,0,sizeof(int));
FixNHCuda_nve_x_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
int reneigh_flag;
cudaMemcpy((void*) (&reneigh_flag), sdata->buffer, sizeof(int),cudaMemcpyDeviceToHost);
sdata->atom.reneigh_flag+=reneigh_flag;
CUT_CHECK_ERROR("FixNHCuda: nve_x Kernel execution failed");
}
void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata,int groupbit, double* factor_h, int mynlocal,int p_triclinic)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
if(sdata->atom.update_nmax)
Cuda_FixNHCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNHCuda_UpdateBuffer(sdata);
F_FLOAT3 factor = {factor_h[0],factor_h[1],factor_h[2]};
F_FLOAT3 factor2 = {0.0, 0.0, 0.0};  // zeroed so the orthogonal path passes defined values
if(p_triclinic) {factor2.x=factor_h[3]; factor2.y=factor_h[4]; factor2.z=factor_h[5];}
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel<<<grid, threads>>> (groupbit,factor,p_triclinic,factor2);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixNHCuda__nve_v_and_nh_v_press_NoBias: Kernel execution failed");
}
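
Four of the wrappers above repeat the same clock_gettime bookkeeping, accumulating the wall time of the symbol updates into cuda_timings.test1. The delta computation, factored out:

#include <time.h>

// Elapsed wall time between two CLOCK_REALTIME samples, in seconds;
// identical to the inline expression accumulated into test1 above.
static double elapsed_seconds(const timespec& t1, const timespec& t2)
{
    return (t2.tv_sec - t1.tv_sec) + 1.0e-9 * (t2.tv_nsec - t1.tv_nsec);
}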

View File

@ -1,32 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixNHCuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
extern "C" void Cuda_FixNHCuda_nh_v_press(cuda_shared_data* sdata, int groupbit, double* factor_h,int mynlocal,int p_triclinic);//mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nh_v_temp(cuda_shared_data* sdata, int groupbit, F_FLOAT factor_eta,int mynlocal);//mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nh_v_press_and_nve_v_NoBias(cuda_shared_data* sdata, int groupbit, double* factor_h,int mynlocal,int p_triclinic);//mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_v(cuda_shared_data* sdata, int groupbit,int mynlocal);//mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_x(cuda_shared_data* sdata, int groupbit,int mynlocal);//mynlocal can be nfirst if firstgroup==igroup see cpp
extern "C" void Cuda_FixNHCuda_nve_v_and_nh_v_press_NoBias(cuda_shared_data* sdata,int groupbit, double* factor_h, int mynlocal,int p_triclinic);//mynlocal can be nfirst if firstgroup==igroup see cpp

View File

@ -1,187 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
static inline __device__ void check_distance(X_FLOAT &xtmp,X_FLOAT &ytmp,X_FLOAT &ztmp,int &i,int groupbit)
{
if(_dist_check)
{
X_FLOAT d=X_F(0.0);
if(i<_nlocal)
{
X_FLOAT tmp=xtmp-_xhold[i];
d=tmp*tmp;
tmp=ytmp-_xhold[i+_maxhold];
d+=tmp*tmp;
tmp=ztmp-_xhold[i+2*_maxhold];
d+=tmp*tmp;
d=((_mask[i] & groupbit))?d:X_F(0.0);
}
if(not __all(d<=_triggerneighsq))
_reneigh_flag[0]=1;
}
}
__global__ void FixNHCuda_nh_v_press_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
V_FLOAT* my_v = _v + i;
V_FLOAT vx=my_v[0];
V_FLOAT vy=my_v[_nmax];
V_FLOAT vz=my_v[2*_nmax];
vx*=factor.x;
vy*=factor.y;
vz*=factor.z;
if(p_triclinic) {
vx += vy*factor2.z + vz*factor2.y;
vy += vz*factor2.x;
}
vx*=factor.x;
vy*=factor.y;
vz*=factor.z;
my_v[0] = vx;
my_v[_nmax] = vy;
my_v[2*_nmax] = vz;
}
}
__global__ void FixNHCuda_nh_v_temp_Kernel(int groupbit, F_FLOAT factor_eta)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
V_FLOAT* my_v = _v + i;
my_v[0]*=factor_eta;
my_v[_nmax]*=factor_eta;
my_v[2*_nmax]*=factor_eta;
}
}
__global__ void FixNHCuda_nh_v_press_and_nve_v_NoBias_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*= V_F(1.0) / _rmass[i];
else dtfm*= V_F(1.0) / _mass[_type[i]];
V_FLOAT vx=my_v[0];
V_FLOAT vy=my_v[_nmax];
V_FLOAT vz=my_v[2*_nmax];
vx*=factor.x;
vy*=factor.y;
vz*=factor.z;
if(p_triclinic) {
vx += vy*factor2.z + vz*factor2.y;
vy += vz*factor2.x;
}
vx*=factor.x;
vy*=factor.y;
vz*=factor.z;
my_v[0] = vx + dtfm * my_f[0];
my_v[_nmax] = vy + dtfm * my_f[_nmax];
my_v[2*_nmax] = vz + dtfm * my_f[_nmax*2];
}
}
__global__ void FixNHCuda_nve_v_Kernel(int groupbit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i];
else dtfm*=V_F(1.0) / _mass[_type[i]];
*my_v = (*my_v + dtfm*(*my_f)); my_f += _nmax; my_v += _nmax;
*my_v = (*my_v + dtfm*(*my_f)); my_f += _nmax; my_v += _nmax;
*my_v = (*my_v + dtfm*(*my_f));
}
}
__global__ void FixNHCuda_nve_x_Kernel(int groupbit)
{
X_FLOAT xtmp=X_F(0.0),ytmp=X_F(0.0),ztmp=X_F(0.0); // zero-init: check_distance may read these for atoms outside the group
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
V_FLOAT* my_v = _v + i;
X_FLOAT* my_x = _x + i;
xtmp = *my_x += _dtv * *my_v; my_v += _nmax; my_x += _nmax;
ytmp = *my_x += _dtv * *my_v; my_v += _nmax; my_x += _nmax;
ztmp = *my_x += _dtv * *my_v;
}
check_distance(xtmp,ytmp,ztmp,i,groupbit);
}
__global__ void FixNHCuda_nve_v_and_nh_v_press_NoBias_Kernel(int groupbit, F_FLOAT3 factor,int p_triclinic,F_FLOAT3 factor2)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i];
else dtfm*=V_F(1.0) / _mass[_type[i]];
V_FLOAT vx = my_v[0] + dtfm*my_f[0];
V_FLOAT vy = my_v[_nmax] + dtfm*my_f[_nmax];
V_FLOAT vz = my_v[2*_nmax] + dtfm*my_f[2*_nmax];
vx*=factor.x;
vy*=factor.y;
vz*=factor.z;
if(p_triclinic) {
vx += vy*factor2.z + vz*factor2.y;
vy += vz*factor2.x;
}
vx*=factor.x;
vy*=factor.y;
vz*=factor.z;
my_v[0] = vx;
my_v[_nmax] = vy;
my_v[2*_nmax] = vz;
}
}
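
One detail worth flagging: multiplying by factor both before and after the triclinic coupling is not an accidental double scaling; it mirrors the symmetric, Trotter-split barostat update of the host-side fix nh code. A one-atom CPU restatement of the press update:

// One-atom restatement of FixNHCuda_nh_v_press_Kernel.
static void nh_v_press_one(double v[3], const double factor[3],
                           int p_triclinic, const double factor2[3])
{
    v[0] *= factor[0]; v[1] *= factor[1]; v[2] *= factor[2];
    if (p_triclinic) {                   // off-diagonal cell coupling
        v[0] += v[1] * factor2[2] + v[2] * factor2[1];
        v[1] += v[2] * factor2[0];
    }
    v[0] *= factor[0]; v[1] *= factor[1]; v[2] *= factor[2];
}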

View File

@ -1,161 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_nve_cuda
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_nve_cuda_cu.h"
#include "fix_nve_cuda_kernel.cu"
void Cuda_FixNVECuda_UpdateNmax(cuda_shared_data* sdata)
{
#ifdef CUDA_USE_BINNING
cudaMemcpyToSymbol(MY_CONST(bin_count_all) , & sdata->atom.bin_count_all .dev_data, sizeof(unsigned*));
cudaMemcpyToSymbol(MY_CONST(bin_count_local), & sdata->atom.bin_count_local.dev_data, sizeof(unsigned*));
cudaMemcpyToSymbol(MY_CONST(bin_dim) , sdata->domain.bin_dim , sizeof(int)*3 );
cudaMemcpyToSymbol(MY_CONST(binned_f) , & sdata->atom.binned_f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_type) , & sdata->atom.binned_type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(binned_v) , & sdata->atom.binned_v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_x) , & sdata->atom.binned_x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(binned_rmass) , & sdata->atom.binned_rmass .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass .dev_data, sizeof(V_FLOAT*) );
#else
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(xhold) , & sdata->atom.xhold.dev_data, sizeof(X_FLOAT*) ); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_CONST(maxhold) , & sdata->atom.maxhold, sizeof(int) ); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata
cudaMemcpyToSymbol(MY_CONST(triggerneighsq), & sdata->atom.triggerneighsq, sizeof(X_FLOAT)); //might be moved to a neighbor record in sdata
#endif
}
void Cuda_FixNVECuda_UpdateBuffer(cuda_shared_data* sdata)
{
int size=(unsigned)10*sizeof(int);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixNVECuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(reneigh_flag), & sdata->buffer, sizeof(int*) ); //might be moved to a neighbor record in sdata
}
void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf)
{
cudaMemcpyToSymbol(MY_CONST(mass) , & sdata->atom.mass.dev_data , sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(dtf) , & dtf , sizeof(V_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(dtv) , & dtv , sizeof(X_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(triggerneighsq), &sdata->atom.triggerneighsq, sizeof(X_FLOAT) );
cudaMemcpyToSymbol(MY_CONST(dist_check), & sdata->atom.dist_check , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag , sizeof(int) ); //
Cuda_FixNVECuda_UpdateNmax(sdata);
}
void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
if(sdata->atom.update_nmax)
Cuda_FixNVECuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNVECuda_UpdateBuffer(sdata);
#ifdef CUDA_USE_BINNING
dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1);
dim3 threads(sdata->domain.bin_nmax, 1, 1);
FixNVECuda_InitialIntegrate_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate (binning) Kernel execution failed");
#else
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
cudaMemset(sdata->buffer,0,sizeof(int));
FixNVECuda_InitialIntegrate_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
int reneigh_flag;
cudaMemcpy((void*) (&reneigh_flag), sdata->buffer, sizeof(int),cudaMemcpyDeviceToHost);
sdata->atom.reneigh_flag+=reneigh_flag;
CUT_CHECK_ERROR("Cuda_FixNVECuda_InitialIntegrate_N: fix nve initial integrate Kernel execution failed");
#endif
}
void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal)//mynlocal can be nfirst if firstgroup==igroup see cpp
{
if(sdata->atom.update_nmax)
Cuda_FixNVECuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixNVECuda_UpdateBuffer(sdata);
#ifdef CUDA_USE_BINNING
dim3 grid(sdata->domain.bin_dim[0], sdata->domain.bin_dim[1] * sdata->domain.bin_neighbors[2], 1);
dim3 threads(sdata->domain.bin_nmax, 1, 1);
FixNVECuda_FinalIntegrate_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate (binning) Kernel execution failed");
#else
int3 layout=getgrid(mynlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixNVECuda_FinalIntegrate_Kernel<<<grid, threads>>> (groupbit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixNVECuda_FinalIntegrate: fix nve final integrate Kernel execution failed");
#endif
}
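
The two integration entry points are the standard velocity-Verlet halves: the initial step kicks v by dtf*f/m and drifts x by dtv*v, the final step kicks v once more with the freshly computed forces. A one-atom CPU sketch:

// Velocity-Verlet halves as implemented by the kernels above.
static void nve_initial_one(double x[3], double v[3], const double f[3],
                            double m, double dtv, double dtf)
{
    double dtfm = dtf / m;
    for (int d = 0; d < 3; d++) {
        v[d] += dtfm * f[d];             // first half kick
        x[d] += dtv * v[d];              // full drift
    }
}

static void nve_final_one(double v[3], const double f[3], double m, double dtf)
{
    double dtfm = dtf / m;
    for (int d = 0; d < 3; d++)
        v[d] += dtfm * f[d];             // second half kick, new forces
}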

View File

@ -1,28 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixNVECuda_Init(cuda_shared_data* sdata, X_FLOAT dtv, V_FLOAT dtf);
extern "C" void Cuda_FixNVECuda_InitialIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);
extern "C" void Cuda_FixNVECuda_FinalIntegrate(cuda_shared_data* sdata, int groupbit, int mynlocal);

View File

@ -1,137 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
static inline __device__ void check_distance(X_FLOAT &xtmp,X_FLOAT &ytmp,X_FLOAT &ztmp,int &i,int groupbit)
{
if(_dist_check)
{
X_FLOAT d=X_F(0.0);
if(i < _nlocal)   // guard the _xhold reads, as the fix_nh variant does
{
X_FLOAT tmp=xtmp-_xhold[i];
d=tmp*tmp;
tmp=ytmp-_xhold[i+_maxhold];
d+=tmp*tmp;
tmp=ztmp-_xhold[i+2*_maxhold];
d+=tmp*tmp;
d=(_mask[i] & groupbit)?d:X_F(0.0);
}
if(!__all(d<=_triggerneighsq))
_reneigh_flag[0]=1;
}
}
__global__ void FixNVECuda_InitialIntegrate_Kernel(int groupbit)
{
X_FLOAT xtmp=X_F(0.0),ytmp=X_F(0.0),ztmp=X_F(0.0); // zero-init: check_distance may read these for atoms outside the group
#ifdef CUDA_USE_BINNING
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
if(threadIdx.x < _bin_count_local[bin])
{
const int i = 3*blockDim.x * bin + threadIdx.x;
if(_mask[i] & groupbit)
{
F_FLOAT* my_f = _binned_f + i;
V_FLOAT* my_v = _binned_v + i;
X_FLOAT* my_x = _binned_x + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*= V_F(1.0) / _binned_rmass[i];
else dtfm*= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
V_FLOAT v_mem;
v_mem = *my_v += dtfm * (*my_f); xtmp = *my_x += _dtv * v_mem; my_f += blockDim.x; my_v += blockDim.x; my_x += blockDim.x;
v_mem = *my_v += dtfm * (*my_f); ytmp = *my_x += _dtv * v_mem; my_f += blockDim.x; my_v += blockDim.x; my_x += blockDim.x;
v_mem = *my_v += dtfm * (*my_f); ztmp = *my_x += _dtv * v_mem;
}
}
#else
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
X_FLOAT* my_x = _x + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*= V_F(1.0) / _rmass[i];
else dtfm*= V_F(1.0) / _mass[_type[i]];
V_FLOAT v_mem;
v_mem = *my_v += dtfm * (*my_f); xtmp=*my_x += _dtv * v_mem; my_f += _nmax; my_v += _nmax; my_x += _nmax;
v_mem = *my_v += dtfm * (*my_f); ytmp=*my_x += _dtv * v_mem; my_f += _nmax; my_v += _nmax; my_x += _nmax;
v_mem = *my_v += dtfm * (*my_f); ztmp=*my_x += _dtv * v_mem;
}
#endif
check_distance(xtmp,ytmp,ztmp,i,groupbit);
}
__global__ void FixNVECuda_FinalIntegrate_Kernel(int groupbit)
{
#ifdef CUDA_USE_BINNING
const unsigned bin = gridDim.y * blockIdx.x + blockIdx.y;
if(threadIdx.x < _bin_count_local[bin])
{
const int i = 3*blockDim.x * bin + threadIdx.x;
if(_mask[i] & groupbit)
{
F_FLOAT* my_f = _binned_f + i;
V_FLOAT* my_v = _binned_v + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*= V_F(1.0) / _binned_rmass[i];
else dtfm*= V_F(1.0) / _mass[_binned_type[blockDim.x * bin + threadIdx.x]];
*my_v += dtfm * (*my_f); my_f += blockDim.x; my_v += blockDim.x;
*my_v += dtfm * (*my_f); my_f += blockDim.x; my_v += blockDim.x;
*my_v += dtfm * (*my_f);
}
}
#else
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal && _mask[i] & groupbit)
{
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
V_FLOAT dtfm = _dtf;
if(_rmass_flag) dtfm*=V_F(1.0) / _rmass[i];
else dtfm*=V_F(1.0) / _mass[_type[i]];
*my_v += dtfm * (*my_f); my_f += _nmax; my_v += _nmax;
*my_v += dtfm * (*my_f); my_f += _nmax; my_v += _nmax;
*my_v += dtfm * (*my_f);
}
#endif
}
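
check_distance uses the warp vote __all so that warps whose atoms have all stayed inside the skin skip the global flag write entirely. __all was the CUDA 4.x-era spelling; CUDA 9 and later spell it __all_sync. The same trick as a stand-alone kernel in the modern form:

// Warp-voted "did any atom move too far?", mirroring check_distance above.
__global__ void any_exceeds(const float* dsq, int n, float trigger, int* flag)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    float d = (i < n) ? dsq[i] : 0.0f;            // out-of-range threads vote 0
    if (!__all_sync(0xffffffffu, d <= trigger))   // true only if every lane passes
        *flag = 1;                                // clean warps skip the store
}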

View File

@ -1,93 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_set_force_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_set_force_cuda_cu.h"
#include "fix_set_force_cuda_kernel.cu"
void Cuda_FixSetForceCuda_UpdateBuffer(cuda_shared_data* sdata)
{
int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int size=(unsigned)(layout.z*layout.y*layout.x)*3*sizeof(F_FLOAT);
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixSetForceCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) );
}
void Cuda_FixSetForceCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
}
void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixSetForceCuda_UpdateNmax(sdata);
}
void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,F_FLOAT* foriginal,int flagx,int flagy,int flagz)
{
if(sdata->atom.update_nmax)
Cuda_FixSetForceCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
if(sdata->buffer_new)
Cuda_FixSetForceCuda_UpdateBuffer(sdata);
int3 layout=getgrid(sdata->atom.nlocal,3*sizeof(F_FLOAT));
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixSetForceCuda_PostForce_Kernel<<<grid, threads,threads.x*3*sizeof(F_FLOAT)>>> (groupbit,xvalue,yvalue,zvalue,flagx,flagy,flagz);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force compute Kernel execution failed");
int oldgrid=grid.x;
grid.x=3;
threads.x=512;
Cuda_FixSetForceCuda_Reduce_FOriginal<<<grid, threads,threads.x*sizeof(F_FLOAT)>>> (oldgrid,foriginal);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixSetForceCuda_PostForce: fix add_force post_force reduce Kernel execution failed");
}
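
cudaThreadSynchronize plus CUT_CHECK_ERROR (a macro from the old CUDA SDK cutil headers) follows every launch in these files; both spellings are long deprecated. The same guard in current API terms:

#include <cstdio>

// Modern equivalent of cudaThreadSynchronize() + CUT_CHECK_ERROR(msg).
static void sync_and_check(const char* msg)
{
    cudaError_t err = cudaGetLastError();         // launch-configuration errors
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();            // errors raised while running
    if (err != cudaSuccess)
        fprintf(stderr, "%s: %s\n", msg, cudaGetErrorString(err));
}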

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixSetForceCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixSetForceCuda_PostForce(cuda_shared_data* sdata, int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,F_FLOAT* foriginal,int flagx,int flagy,int flagz);

View File

@ -1,79 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
extern __shared__ F_FLOAT sharedmem[];
__global__ void Cuda_FixSetForceCuda_PostForce_Kernel(int groupbit,F_FLOAT xvalue,F_FLOAT yvalue,F_FLOAT zvalue,int flagx,int flagy,int flagz)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
sharedmem[threadIdx.x]=0;
sharedmem[threadIdx.x+blockDim.x]=0;
sharedmem[threadIdx.x+2*blockDim.x]=0;
if(i < _nlocal)
if (_mask[i] & groupbit) {
sharedmem[threadIdx.x]=_f[i];
sharedmem[threadIdx.x+blockDim.x]=_f[i+1*_nmax];
sharedmem[threadIdx.x+2*blockDim.x]=_f[i+2*_nmax];
if(flagx) _f[i] = xvalue;
if(flagy) _f[i+1*_nmax] = yvalue;
if(flagz) _f[i+2*_nmax] = zvalue;
}
reduceBlock(sharedmem);
reduceBlock(&sharedmem[blockDim.x]);
reduceBlock(&sharedmem[2*blockDim.x]);
F_FLOAT* buffer=(F_FLOAT*)_buffer;
if(threadIdx.x==0)
{
buffer[blockIdx.x*gridDim.y+blockIdx.y]=sharedmem[0];
buffer[blockIdx.x*gridDim.y+blockIdx.y+gridDim.x*gridDim.y]=sharedmem[blockDim.x];
buffer[blockIdx.x*gridDim.y+blockIdx.y+2*gridDim.x*gridDim.y]=sharedmem[2*blockDim.x];
}
}
__global__ void Cuda_FixSetForceCuda_Reduce_FOriginal(int n,F_FLOAT* foriginal)
{
int i=0;
sharedmem[threadIdx.x]=0;
F_FLOAT myforig=0.0;
F_FLOAT* buf=(F_FLOAT*)_buffer;
buf=&buf[blockIdx.x*n];
while(i<n)
{
sharedmem[threadIdx.x]=0;
if(i+threadIdx.x<n)
sharedmem[threadIdx.x]=buf[i+threadIdx.x];
__syncthreads();
reduceBlock(sharedmem);
i+=blockDim.x;
if(threadIdx.x==0)
myforig+=sharedmem[0];
}
if(threadIdx.x==0)
foriginal[blockIdx.x]=myforig;
}

View File

@ -1,275 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_shake_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_shake_cuda_cu.h"
#include "cuda_pair_virial_kernel_nc.cu"
#define _shake_atom MY_AP(shake_atom)
#define _shake_type MY_AP(shake_type)
#define _shake_flag MY_AP(shake_flag)
#define _xshake MY_AP(xshake)
#define _dtfsq MY_AP(dtfsq)
#define _bond_distance MY_AP(bond_distance)
#define _angle_distance MY_AP(angle_distance)
#define _max_iter MY_AP(max_iter)
#define _tolerance MY_AP(tolerance)
__device__ __constant__ int* _shake_atom;
__device__ __constant__ int* _shake_type;
__device__ __constant__ int* _shake_flag;
__device__ __constant__ X_FLOAT3* _xshake;
__device__ __constant__ F_FLOAT _dtfsq;
__device__ __constant__ X_FLOAT* _bond_distance;
__device__ __constant__ X_FLOAT* _angle_distance;
__device__ __constant__ int _max_iter;
__device__ __constant__ X_FLOAT _tolerance;
#include "fix_shake_cuda_kernel.cu"
void Cuda_FixShakeCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(v) , & sdata->atom.v .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(rmass) , & sdata->atom.rmass.dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(map_array), & sdata->atom.map_array .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(vatom) , & sdata->atom.vatom.dev_data, sizeof(ENERGY_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(debugdata), & sdata->debugdata , sizeof(int*) );
}
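// nmax growth implies the per-atom device arrays were reallocated, so every
// pointer mirrored into constant memory above must be refreshed; this is why
// callers invoke UpdateNmax whenever sdata->atom.update_nmax is set.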
void Cuda_FixShakeCuda_UpdateDomain(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(periodicity), sdata->domain.periodicity , sizeof(int)*3 );
cudaMemcpyToSymbol(MY_CONST(prd) , sdata->domain.prd , sizeof(X_FLOAT)*3 );
cudaMemcpyToSymbol(MY_CONST(triclinic) , &sdata->domain.triclinic , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(h) , sdata->domain.h , sizeof(X_FLOAT)*6 );
}
void Cuda_FixShakeCuda_UpdateBuffer(cuda_shared_data* sdata,int size)
{
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_FixShakeCuda Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer, sizeof(int*) );
}
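// Note: the shared buffer only ever grows; incrementing sdata->buffer_new
// signals the other modules that cached the _buffer symbol to re-fetch it.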
void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata,X_FLOAT dtv, F_FLOAT dtfsq,
void* shake_flag,void* shake_atom,void* shake_type, void* xshake,
void* bond_distance,void* angle_distance,void* virial,
int max_iter,X_FLOAT tolerance)
{
Cuda_FixShakeCuda_UpdateNmax(sdata);
Cuda_FixShakeCuda_UpdateDomain(sdata);
cudaMemcpyToSymbol(MY_CONST(shake_atom) , & shake_atom , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(shake_type) , & shake_type , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(shake_flag) , & shake_flag , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(xshake) , & xshake , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(dtv) , & dtv , sizeof(X_FLOAT));
cudaMemcpyToSymbol(MY_CONST(dtfsq) , & dtfsq , sizeof(F_FLOAT));
cudaMemcpyToSymbol(MY_CONST(bond_distance) , & bond_distance , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(angle_distance) , & angle_distance , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(virial) , & virial , sizeof(void*) );
cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag , sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(max_iter) , &max_iter , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(tolerance) , &tolerance , sizeof(X_FLOAT));
if(sdata->atom.mass_host)
cudaMemcpyToSymbol(MY_CONST(mass),& sdata->atom.mass.dev_data , sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(rmass_flag), & sdata->atom.rmass_flag , sizeof(int) ); //
cudaMemcpyToSymbol(MY_CONST(flag) , &sdata->flag, sizeof(int*));
}
void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata)
{
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
if(sdata->buffer_new)
Cuda_FixShakeCuda_UpdateBuffer(sdata,10*sizeof(double));
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
FixShakeCuda_UnconstrainedUpdate_Kernel<<<grid, threads>>> ();
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixShakeCuda_UnconstrainedUpdate: Kernel execution failed");
}
void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata,int vflag,int vflag_atom,int* list,int nlist)
{
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->domain.update)
Cuda_FixShakeCuda_UpdateDomain(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int));
int3 layout=getgrid(sdata->atom.nlocal,6*sizeof(ENERGY_FLOAT),64);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->buffer_new)
Cuda_FixShakeCuda_UpdateBuffer(sdata,grid.x*grid.y*6*sizeof(ENERGY_FLOAT));
BindXTypeTexture(sdata);
FixShakeCuda_Shake_Kernel<<<grid, threads,6*threads.x*sizeof(ENERGY_FLOAT)>>> (vflag,vflag_atom,list,nlist);
cudaThreadSynchronize();
CUT_CHECK_ERROR("FixShakeCuda_Shake: Kernel execution failed");
if(vflag)
{
int n=grid.x*grid.y;
grid.x=6;
grid.y=1;
threads.x=256;
MY_AP(PairVirialCompute_reduce)<<<grid,threads,threads.x*sizeof(ENERGY_FLOAT)>>>(n);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixShakeCuda: (no binning) virial compute Kernel execution failed");
}
}
int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
{
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata,size);
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
cudaMemset( sdata->flag,0,sizeof(int));
FixShakeCuda_PackComm_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz);
cudaThreadSynchronize();
cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
int aflag;
cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
if(aflag!=0) printf("aflag PackComm: %i\n",aflag);
CUT_CHECK_ERROR("Cuda_FixShakeCuda_PackComm: Kernel execution failed");
}
return 3*n;
}
int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
{
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata,size);
static int count=-1;
count++;
X_FLOAT dx=0.0;
X_FLOAT dy=0.0;
X_FLOAT dz=0.0;
if (pbc_flag != 0) {
if (sdata->domain.triclinic == 0) {
dx = pbc[0]*sdata->domain.prd[0];
dy = pbc[1]*sdata->domain.prd[1];
dz = pbc[2]*sdata->domain.prd[2];
} else {
dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
dz = pbc[2]*sdata->domain.prd[2];
}}
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
FixShakeCuda_PackComm_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
}
return 3*n;
}
void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv)
{
if(sdata->atom.update_nmax)
Cuda_FixShakeCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int size=n*3*sizeof(X_FLOAT);
if(sdata->buffer_new or (size>sdata->buffersize))
Cuda_FixShakeCuda_UpdateBuffer(sdata,size);
int3 layout=getgrid(n);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
if(sdata->atom.nlocal>0)
{
cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
FixShakeCuda_UnpackComm_Kernel<<<grid, threads,0>>>(n,first);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_FixShakeCuda_UnpackComm: Kernel execution failed");
}
}

View File

@ -1,34 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixShakeCuda_Init(cuda_shared_data* sdata,X_FLOAT dtv, F_FLOAT dtfsq,
void* shake_flag,void* shake_atom,void* shake_type, void* xshake,
void* bond_distance,void* angle_distance,void* virial,
int max_iter,X_FLOAT tolerance);
extern "C" void Cuda_FixShakeCuda_UnconstrainedUpdate(cuda_shared_data* sdata);
extern "C" void Cuda_FixShakeCuda_Shake(cuda_shared_data* sdata,int vflag,int vflag_atom,int* list,int nlist);
extern "C" int Cuda_FixShakeCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag);
extern "C" int Cuda_FixShakeCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag);
extern "C" void Cuda_FixShakeCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv);

View File

@ -1,971 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__device__ void v_tally(int& vflag_global,int& vflag_atom,int& n, int *list, ENERGY_FLOAT total, ENERGY_FLOAT *v)
{
/*if(vflag_global)
{
ENERGY_FLOAT fraction = n/total;
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
*shared += fraction*v[0]; shared+=blockDim.x;
*shared += fraction*v[1]; shared+=blockDim.x;
*shared += fraction*v[2]; shared+=blockDim.x;
*shared += fraction*v[3]; shared+=blockDim.x;
*shared += fraction*v[4]; shared+=blockDim.x;
*shared += fraction*v[5];
}*/
if (vflag_atom) {
ENERGY_FLOAT fraction = ENERGY_F(1.0)/total;
for (int i = 0; i < n; i++) {
int m = list[i];
ENERGY_FLOAT* myvatom=&_vatom[m];
*myvatom += fraction*v[0]; myvatom+=_nmax;
*myvatom += fraction*v[1]; myvatom+=_nmax;
*myvatom += fraction*v[2]; myvatom+=_nmax;
*myvatom += fraction*v[3]; myvatom+=_nmax;
*myvatom += fraction*v[4]; myvatom+=_nmax;
*myvatom += fraction*v[5];
}
}
}
inline __device__ void minimum_image(X_FLOAT3& delta)
{
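// Branch-free nearest-image convention: each displacement component is
// shifted by one box length once it leaves [-prd/2, +prd/2]. In the
// triclinic branch z must be wrapped first, since the tilt factors h[3]
// (yz) and h[4] (xz) fold the z shift into y and x; the y wrap then folds
// h[5] (xy) into x before x itself is wrapped.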
if (_triclinic == 0) {
if (_periodicity[0]) {
delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] :
(delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0));
}
if (_periodicity[1]) {
delta.y += delta.y < -X_F(0.5)*_prd[1] ? _prd[1] :
(delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0));
}
if (_periodicity[2]) {
delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] :
(delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0));
}
} else {
if (_periodicity[2]) {
delta.z += delta.z < -X_F(0.5)*_prd[2] ? _prd[2] :
(delta.z > X_F(0.5)*_prd[2] ?-_prd[2] : X_F(0.0));
delta.y += delta.z < -X_F(0.5)*_prd[2] ? _h[3] :
(delta.z > X_F(0.5)*_prd[2] ?-_h[3] : X_F(0.0));
delta.x += delta.z < -X_F(0.5)*_prd[2] ? _h[4] :
(delta.z > X_F(0.5)*_prd[2] ?-_h[4] : X_F(0.0));
}
if (_periodicity[1]) {
delta.y += delta.y < -X_F(0.5)*_prd[1] ? _prd[1] :
(delta.y > X_F(0.5)*_prd[1] ?-_prd[1] : X_F(0.0));
delta.x += delta.y < -X_F(0.5)*_prd[1] ? _h[5] :
(delta.y > X_F(0.5)*_prd[1] ?-_h[5] : X_F(0.0));
}
if (_periodicity[0]) {
delta.x += delta.x < -X_F(0.5)*_prd[0] ? _prd[0] :
(delta.x > X_F(0.5)*_prd[0] ?-_prd[0] : X_F(0.0));
}
}
}
__global__ void FixShakeCuda_UnconstrainedUpdate_Kernel()
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i>=_nlocal) return;
X_FLOAT3 my_xshake = {X_F(0.0),X_F(0.0),X_F(0.0)};
if(_shake_flag[i])
{
F_FLOAT* my_f = _f + i;
V_FLOAT* my_v = _v + i;
X_FLOAT* my_x = _x + i;
V_FLOAT dtfmsq = _dtfsq;
if(_rmass_flag) dtfmsq*= V_F(1.0) / _rmass[i];
else dtfmsq*= V_F(1.0) / _mass[_type[i]];
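// unconstrained velocity-Verlet predictor: x' = x + dt*v + dtfmsq*f,
// with dtfmsq = _dtfsq/m playing the role of dt^2/(2m) (set host-side)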
my_xshake.x = *my_x + _dtv* *my_v + dtfmsq* *my_f; my_f += _nmax; my_v += _nmax; my_x += _nmax;
my_xshake.y = *my_x + _dtv* *my_v + dtfmsq* *my_f; my_f += _nmax; my_v += _nmax; my_x += _nmax;
my_xshake.z = *my_x + _dtv* *my_v + dtfmsq* *my_f;
}
_xshake[i]=my_xshake;
}
__device__ void FixShakeCuda_Shake2(int& vflag,int& vflag_atom,int& m)
{
int nlist,list[2];
ENERGY_FLOAT v[6];
X_FLOAT invmass0,invmass1;
// local atom IDs and constraint distances
int i0 = _map_array[_shake_atom[m]];
int i1 = _map_array[_shake_atom[m+_nmax]];
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
// r01 = distance vec between atoms, with PBC
X_FLOAT3 r01;
X_FLOAT4 x_i0,x_i1;
x_i0=fetchXType(i0);
x_i1=fetchXType(i1);
r01.x = x_i0.x - x_i1.x;
r01.y = x_i0.y - x_i1.y;
r01.z = x_i0.z - x_i1.z;
minimum_image(r01);
// s01 = distance vec after unconstrained update, with PBC
X_FLOAT3 s01;
X_FLOAT3 xs_i0=_xshake[i0];
X_FLOAT3 xs_i1=_xshake[i1];
s01.x = xs_i0.x - xs_i1.x;
s01.y = xs_i0.y - xs_i1.y;
s01.z = xs_i0.z - xs_i1.z;
minimum_image(s01);
// scalar distances between atoms
X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z;
X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z;
// a,b,c = coeffs in quadratic equation for lamda
if (_rmass_flag) {
invmass0 = X_F(1.0)/_rmass[i0];
invmass1 = X_F(1.0)/_rmass[i1];
} else {
invmass0 = X_F(1.0)/_mass[static_cast <int> (x_i0.w)];
invmass1 = X_F(1.0)/_mass[static_cast <int> (x_i1.w)];
}
X_FLOAT a = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
X_FLOAT b = X_F(2.0) * (invmass0+invmass1) *
(s01.x*r01.x + s01.y*r01.y + s01.z*r01.z);
X_FLOAT c = s01sq - bond1*bond1;
// error check
X_FLOAT determ = b*b - X_F(4.0)*a*c;
if (determ < X_F(0.0)) {
_flag[0]++;
determ = X_F(0.0);
}
// exact quadratic solution for lamda
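// both roots of a*lamda^2 + b*lamda + c = 0 are formed via the sum identity
// lamda1 + lamda2 = -b/a; the root of smaller magnitude is kept, since the
// constraint multiplier should be the small physical correction rather than
// the bond-inverting one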
X_FLOAT lamda,lamda1,lamda2;
lamda1 = -b+_SQRT_(determ);
lamda2 = -lamda1 - X_F(2.0)*b;
lamda1 *= X_F(1.0) / (X_F(2.0)*a);
lamda2 *= X_F(1.0) / (X_F(2.0)*a);
lamda = (fabs(lamda1) <= fabs(lamda2))? lamda1 : lamda2;
// update forces if atom is owned by this processor
lamda*= X_F(1.0)/_dtfsq;
//attention: is the shake cluster <-> atom mapping unique?
nlist = 0;
if (i0 < _nlocal) {
_f[i0] += lamda*r01.x;
_f[i0+_nmax] += lamda*r01.y;
_f[i0+2*_nmax] += lamda*r01.z;
list[nlist++] = i0;
}
if (i1 < _nlocal) {
_f[i1] -= lamda*r01.x;
_f[i1+_nmax] -= lamda*r01.y;
_f[i1+2*_nmax] -= lamda*r01.z;
list[nlist++] = i1;
}
if (vflag||vflag_atom) {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
X_FLOAT factor=nlist;
v[0] = lamda*r01.x*r01.x; *shared = factor*v[0]; shared+=blockDim.x; //factor = 2*nlist/natoms (= nlist here): the 2 cancels the 0.5 the shared virial reducer applies, leaving the CPU code's n/total fraction
v[1] = lamda*r01.y*r01.y; *shared = factor*v[1]; shared+=blockDim.x;
v[2] = lamda*r01.z*r01.z; *shared = factor*v[2]; shared+=blockDim.x;
v[3] = lamda*r01.x*r01.y; *shared = factor*v[3]; shared+=blockDim.x;
v[4] = lamda*r01.x*r01.z; *shared = factor*v[4]; shared+=blockDim.x;
v[5] = lamda*r01.y*r01.z; *shared = factor*v[5]; shared+=blockDim.x;
v_tally(vflag,vflag_atom,nlist,list,2.0,v);
}
}
__device__ void FixShakeCuda_Shake3(int& vflag,int& vflag_atom,int& m)
{
int nlist,list[3];
ENERGY_FLOAT v[6];
X_FLOAT invmass0,invmass1,invmass2;
// local atom IDs and constraint distances
int i0 = _map_array[_shake_atom[m]];
int i1 = _map_array[_shake_atom[m+_nmax]];
int i2 = _map_array[_shake_atom[m+2*_nmax]];
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]];
// r01 = distance vec between atoms, with PBC
X_FLOAT3 r01,r02;
X_FLOAT4 x_i0,x_i1,x_i2;
x_i0=fetchXType(i0);
x_i1=fetchXType(i1);
x_i2=fetchXType(i2);
r01.x = x_i0.x - x_i1.x;
r01.y = x_i0.y - x_i1.y;
r01.z = x_i0.z - x_i1.z;
minimum_image(r01);
r02.x = x_i0.x - x_i2.x;
r02.y = x_i0.y - x_i2.y;
r02.z = x_i0.z - x_i2.z;
minimum_image(r02);
// s01 = distance vec after unconstrained update, with PBC
X_FLOAT3 s01,s02;
X_FLOAT3 xs_i0=_xshake[i0];
X_FLOAT3 xs_i1=_xshake[i1];
X_FLOAT3 xs_i2=_xshake[i2];
s01.x = xs_i0.x - xs_i1.x;
s01.y = xs_i0.y - xs_i1.y;
s01.z = xs_i0.z - xs_i1.z;
minimum_image(s01);
s02.x = xs_i0.x - xs_i2.x;
s02.y = xs_i0.y - xs_i2.y;
s02.z = xs_i0.z - xs_i2.z;
minimum_image(s02);
// scalar distances between atoms
X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z;
X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z;
X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z;
X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z;
// a,b,c = coeffs in quadratic equation for lamda
if (_rmass_flag) {
invmass0 = X_F(1.0)/_rmass[i0];
invmass1 = X_F(1.0)/_rmass[i1];
invmass2 = X_F(1.0)/_rmass[i2];
} else {
invmass0 = X_F(1.0)/_mass[static_cast <int> (x_i0.w)];
invmass1 = X_F(1.0)/_mass[static_cast <int> (x_i1.w)];
invmass2 = X_F(1.0)/_mass[static_cast <int> (x_i2.w)];
}
X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) *
(s01.x*r01.x + s01.y*r01.y + s01.z*r01.z);
X_FLOAT a12 = X_F(2.0) * invmass0 *
(s01.x*r02.x + s01.y*r02.y + s01.z*r02.z);
X_FLOAT a21 = X_F(2.0) * invmass0 *
(s02.x*r01.x + s02.y*r01.y + s02.z*r01.z);
X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) *
(s02.x*r02.x + s02.y*r02.y + s02.z*r02.z);
// error check
X_FLOAT determ = a11*a22 - a12*a21;
if (determ == X_F(0.0)) _flag[0]++;
X_FLOAT determinv = X_F(1.0)/determ;
X_FLOAT a11inv = a22*determinv;
X_FLOAT a12inv = -a12*determinv;
X_FLOAT a21inv = -a21*determinv;
X_FLOAT a22inv = a11*determinv;
// quadratic correction coeffs
X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z);
X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq;
X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102;
X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq;
X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq;
X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102;
// iterate until converged
X_FLOAT lamda01 = X_F(0.0);
X_FLOAT lamda02 = X_F(0.0);
int niter = 0;
int done = 0;
X_FLOAT quad1,quad2,b1,b2,lamda01_new,lamda02_new;
//maybe all running full loop?
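// __any() is a warp-wide vote: the loop keeps iterating until every lane of
// the warp has converged, so the warp stays in lockstep. A lane applies its
// last update on its first converged sweep (done==1) and freezes its
// multipliers once done reaches 2.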
while (__any(!done) && niter < _max_iter) {
quad1 = quad1_0101 * lamda01*lamda01 + quad1_0202 * lamda02*lamda02 +
quad1_0102 * lamda01*lamda02;
quad2 = quad2_0101 * lamda01*lamda01 + quad2_0202 * lamda02*lamda02 +
quad2_0102 * lamda01*lamda02;
b1 = bond1*bond1 - s01sq - quad1;
b2 = bond2*bond2 - s02sq - quad2;
lamda01_new = a11inv*b1 + a12inv*b2;
lamda02_new = a21inv*b1 + a22inv*b2;
done++;
done = (fabs(lamda01_new-lamda01) > _tolerance)?0: done;
done = (fabs(lamda02_new-lamda02) > _tolerance)?0: done;
lamda01 = done<2?lamda01_new:lamda01;
lamda02 = done<2?lamda02_new:lamda02;
niter++;
}
// update forces if atom is owned by this processor
lamda01 *= X_F(1.0)/_dtfsq;
lamda02 *= X_F(1.0)/_dtfsq;
//attention: is the shake cluster <-> atom mapping unique?
nlist = 0;
if (i0 < _nlocal) {
_f[i0] += lamda01*r01.x + lamda02*r02.x;
_f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y;
_f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z;
list[nlist++] = i0;
}
if (i1 < _nlocal) {
_f[i1] -= lamda01*r01.x;
_f[i1+_nmax] -= lamda01*r01.y;
_f[i1+2*_nmax] -= lamda01*r01.z;
list[nlist++] = i1;
}
if (i2 < _nlocal) {
_f[i2] -= lamda02*r02.x;
_f[i2+_nmax] -= lamda02*r02.y;
_f[i2+2*_nmax] -= lamda02*r02.z;
list[nlist++] = i2;
}
if (vflag||vflag_atom) {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
X_FLOAT factor=X_F(2.0)/X_F(3.0)*nlist;
v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x; *shared = factor*v[0]; shared+=blockDim.x; //factor = 2*nlist/natoms: the 2 cancels the 0.5 the shared virial reducer applies, leaving the CPU code's n/total fraction
v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y; *shared = factor*v[1]; shared+=blockDim.x;
v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z; *shared = factor*v[2]; shared+=blockDim.x;
v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y; *shared = factor*v[3]; shared+=blockDim.x;
v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z; *shared = factor*v[4]; shared+=blockDim.x;
v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z; *shared = factor*v[5]; shared+=blockDim.x;
v_tally(vflag,vflag_atom,nlist,list,3.0,v);
}
}
__device__ void FixShakeCuda_Shake4(int& vflag,int& vflag_atom,int& m)
{
int nlist,list[4];
ENERGY_FLOAT v[6];
X_FLOAT invmass0,invmass1,invmass2,invmass3;
// local atom IDs and constraint distances
int i0 = _map_array[_shake_atom[m]];
int i1 = _map_array[_shake_atom[m+_nmax]];
int i2 = _map_array[_shake_atom[m+2*_nmax]];
int i3 = _map_array[_shake_atom[m+3*_nmax]];
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]];
X_FLOAT bond3 = _bond_distance[_shake_type[m+2*_nmax]];
// r01 = distance vec between atoms, with PBC
X_FLOAT3 r01,r02,r03;
X_FLOAT4 x_i0,x_i1,x_i2,x_i3;
x_i0=fetchXType(i0);
x_i1=fetchXType(i1);
x_i2=fetchXType(i2);
x_i3=fetchXType(i3);
r01.x = x_i0.x - x_i1.x;
r01.y = x_i0.y - x_i1.y;
r01.z = x_i0.z - x_i1.z;
minimum_image(r01);
r02.x = x_i0.x - x_i2.x;
r02.y = x_i0.y - x_i2.y;
r02.z = x_i0.z - x_i2.z;
minimum_image(r02);
r03.x = x_i0.x - x_i3.x;
r03.y = x_i0.y - x_i3.y;
r03.z = x_i0.z - x_i3.z;
minimum_image(r03);
// s01 = distance vec after unconstrained update, with PBC
X_FLOAT3 s01,s02,s03;
X_FLOAT3 xs_i0=_xshake[i0];
X_FLOAT3 xs_i1=_xshake[i1];
X_FLOAT3 xs_i2=_xshake[i2];
X_FLOAT3 xs_i3=_xshake[i3];
s01.x = xs_i0.x - xs_i1.x;
s01.y = xs_i0.y - xs_i1.y;
s01.z = xs_i0.z - xs_i1.z;
minimum_image(s01);
s02.x = xs_i0.x - xs_i2.x;
s02.y = xs_i0.y - xs_i2.y;
s02.z = xs_i0.z - xs_i2.z;
minimum_image(s02);
s03.x = xs_i0.x - xs_i3.x;
s03.y = xs_i0.y - xs_i3.y;
s03.z = xs_i0.z - xs_i3.z;
minimum_image(s03);
// scalar distances between atoms
X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z;
X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z;
X_FLOAT r03sq = r03.x*r03.x + r03.y*r03.y + r03.z*r03.z;
X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z;
X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z;
X_FLOAT s03sq = s03.x*s03.x + s03.y*s03.y + s03.z*s03.z;
// a,b,c = coeffs in quadratic equation for lamda
if (_rmass_flag) {
invmass0 = X_F(1.0)/_rmass[i0];
invmass1 = X_F(1.0)/_rmass[i1];
invmass2 = X_F(1.0)/_rmass[i2];
invmass3 = X_F(1.0)/_rmass[i3];
} else {
invmass0 = X_F(1.0)/_mass[static_cast <int> (x_i0.w)];
invmass1 = X_F(1.0)/_mass[static_cast <int> (x_i1.w)];
invmass2 = X_F(1.0)/_mass[static_cast <int> (x_i2.w)];
invmass3 = X_F(1.0)/_mass[static_cast <int> (x_i3.w)];
}
X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) *
(s01.x*r01.x + s01.y*r01.y + s01.z*r01.z);
X_FLOAT a12 = X_F(2.0) * invmass0 *
(s01.x*r02.x + s01.y*r02.y + s01.z*r02.z);
X_FLOAT a13 = X_F(2.0) * invmass0 *
(s01.x*r03.x + s01.y*r03.y + s01.z*r03.z);
X_FLOAT a21 = X_F(2.0) * invmass0 *
(s02.x*r01.x + s02.y*r01.y + s02.z*r01.z);
X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) *
(s02.x*r02.x + s02.y*r02.y + s02.z*r02.z);
X_FLOAT a23 = X_F(2.0) * (invmass0) *
(s02.x*r03.x + s02.y*r03.y + s02.z*r03.z);
X_FLOAT a31 = X_F(2.0) * (invmass0) *
(s03.x*r01.x + s03.y*r01.y + s03.z*r01.z);
X_FLOAT a32 = X_F(2.0) * (invmass0) *
(s03.x*r02.x + s03.y*r02.y + s03.z*r02.z);
X_FLOAT a33 = X_F(2.0) * (invmass0+invmass3) *
(s03.x*r03.x + s03.y*r03.y + s03.z*r03.z);
// error check
X_FLOAT determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 -
a11*a23*a32 - a12*a21*a33 - a13*a22*a31;
if (determ == X_F(0.0)) _flag[0]++;
X_FLOAT determinv = X_F(1.0)/determ;
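// closed-form 3x3 inverse via cofactors (adjugate over the determinant):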
X_FLOAT a11inv = determinv * (a22*a33 - a23*a32);
X_FLOAT a12inv = -determinv * (a12*a33 - a13*a32);
X_FLOAT a13inv = determinv * (a12*a23 - a13*a22);
X_FLOAT a21inv = -determinv * (a21*a33 - a23*a31);
X_FLOAT a22inv = determinv * (a11*a33 - a13*a31);
X_FLOAT a23inv = -determinv * (a11*a23 - a13*a21);
X_FLOAT a31inv = determinv * (a21*a32 - a22*a31);
X_FLOAT a32inv = -determinv * (a11*a32 - a12*a31);
X_FLOAT a33inv = determinv * (a11*a22 - a12*a21);
// quadratic correction coeffs
X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z);
X_FLOAT r0103 = (r01.x*r03.x + r01.y*r03.y + r01.z*r03.z);
X_FLOAT r0203 = (r02.x*r03.x + r02.y*r03.y + r02.z*r03.z);
X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq;
X_FLOAT quad1_0303 = invmass0*invmass0 * r03sq;
X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102;
X_FLOAT quad1_0103 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0103;
X_FLOAT quad1_0203 = X_F(2.0) * invmass0*invmass0 * r0203;
X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq;
X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq;
X_FLOAT quad2_0303 = invmass0*invmass0 * r03sq;
X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102;
X_FLOAT quad2_0103 = X_F(2.0) * invmass0*invmass0 * r0103;
X_FLOAT quad2_0203 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0203;
X_FLOAT quad3_0101 = invmass0*invmass0 * r01sq;
X_FLOAT quad3_0202 = invmass0*invmass0 * r02sq;
X_FLOAT quad3_0303 = (invmass0+invmass3)*(invmass0+invmass3) * r03sq;
X_FLOAT quad3_0102 = X_F(2.0) * invmass0*invmass0 * r0102;
X_FLOAT quad3_0103 = X_F(2.0) * (invmass0+invmass3)*invmass0 * r0103;
X_FLOAT quad3_0203 = X_F(2.0) * (invmass0+invmass3)*invmass0 * r0203;
// iterate until converged
X_FLOAT lamda01 = X_F(0.0);
X_FLOAT lamda02 = X_F(0.0);
X_FLOAT lamda03 = X_F(0.0);
int niter = 0;
int done = 0;
X_FLOAT quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda03_new;
//maybe all running full loop?
while (__any(!done) && niter < _max_iter) {
quad1 = quad1_0101 * lamda01*lamda01 +
quad1_0202 * lamda02*lamda02 +
quad1_0303 * lamda03*lamda03 +
quad1_0102 * lamda01*lamda02 +
quad1_0103 * lamda01*lamda03 +
quad1_0203 * lamda02*lamda03;
quad2 = quad2_0101 * lamda01*lamda01 +
quad2_0202 * lamda02*lamda02 +
quad2_0303 * lamda03*lamda03 +
quad2_0102 * lamda01*lamda02 +
quad2_0103 * lamda01*lamda03 +
quad2_0203 * lamda02*lamda03;
quad3 = quad3_0101 * lamda01*lamda01 +
quad3_0202 * lamda02*lamda02 +
quad3_0303 * lamda03*lamda03 +
quad3_0102 * lamda01*lamda02 +
quad3_0103 * lamda01*lamda03 +
quad3_0203 * lamda02*lamda03;
b1 = bond1*bond1 - s01sq - quad1;
b2 = bond2*bond2 - s02sq - quad2;
b3 = bond3*bond3 - s03sq - quad3;
lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3;
lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3;
lamda03_new = a31inv*b1 + a32inv*b2 + a33inv*b3;
done++;
done = (fabs(lamda01_new-lamda01) > _tolerance)? 0:done;
done = (fabs(lamda02_new-lamda02) > _tolerance)? 0:done;
done = (fabs(lamda03_new-lamda03) > _tolerance)? 0:done;
lamda01 = done<2?lamda01_new:lamda01;
lamda02 = done<2?lamda02_new:lamda02;
lamda03 = done<2?lamda03_new:lamda03;
niter++;
}
// update forces if atom is owned by this processor
lamda01 *= X_F(1.0)/_dtfsq;
lamda02 *= X_F(1.0)/_dtfsq;
lamda03 *= X_F(1.0)/_dtfsq;
//attention: is the shake cluster <-> atom mapping unique?
nlist = 0;
if (i0 < _nlocal) {
_f[i0] += lamda01*r01.x + lamda02*r02.x + lamda03*r03.x;
_f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y + lamda03*r03.y;
_f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z + lamda03*r03.z;
list[nlist++] = i0;
}
if (i1 < _nlocal) {
_f[i1] -= lamda01*r01.x;
_f[i1+_nmax] -= lamda01*r01.y;
_f[i1+2*_nmax] -= lamda01*r01.z;
list[nlist++] = i1;
}
if (i2 < _nlocal) {
_f[i2] -= lamda02*r02.x;
_f[i2+_nmax] -= lamda02*r02.y;
_f[i2+2*_nmax] -= lamda02*r02.z;
list[nlist++] = i2;
}
if (i3 < _nlocal) {
_f[i3] -= lamda03*r03.x;
_f[i3+_nmax] -= lamda03*r03.y;
_f[i3+2*_nmax] -= lamda03*r03.z;
list[nlist++] = i3;
}
if (vflag||vflag_atom) {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
X_FLOAT factor=X_F(2.0)/X_F(4.0)*nlist;
v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x + lamda03*r03.x*r03.x; *shared = factor*v[0]; shared+=blockDim.x; //factor = 2*nlist/natoms: the 2 cancels the 0.5 the shared virial reducer applies, leaving the CPU code's n/total fraction
v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y + lamda03*r03.y*r03.y; *shared = factor*v[1]; shared+=blockDim.x;
v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z + lamda03*r03.z*r03.z; *shared = factor*v[2]; shared+=blockDim.x;
v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y + lamda03*r03.x*r03.y; *shared = factor*v[3]; shared+=blockDim.x;
v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z + lamda03*r03.x*r03.z; *shared = factor*v[4]; shared+=blockDim.x;
v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z + lamda03*r03.y*r03.z; *shared = factor*v[5]; shared+=blockDim.x;
v_tally(vflag,vflag_atom,nlist,list,4.0,v);
}
}
__device__ void FixShakeCuda_Shake3Angle(int& vflag,int& vflag_atom,int& m)
{
int nlist,list[3];
ENERGY_FLOAT v[6];
X_FLOAT invmass0,invmass1,invmass2;
// local atom IDs and constraint distances
int i0 = _map_array[_shake_atom[m]];
int i1 = _map_array[_shake_atom[m+_nmax]];
int i2 = _map_array[_shake_atom[m+2*_nmax]];
X_FLOAT bond1 = _bond_distance[_shake_type[m]];
X_FLOAT bond2 = _bond_distance[_shake_type[m+_nmax]];
X_FLOAT bond12 = _angle_distance[_shake_type[m+2*_nmax]];
// r01 = distance vec between atoms, with PBC
X_FLOAT3 r01,r02,r12;
X_FLOAT4 x_i0,x_i1,x_i2;
x_i0=fetchXType(i0);
x_i1=fetchXType(i1);
x_i2=fetchXType(i2);
r01.x = x_i0.x - x_i1.x;
r01.y = x_i0.y - x_i1.y;
r01.z = x_i0.z - x_i1.z;
minimum_image(r01);
r02.x = x_i0.x - x_i2.x;
r02.y = x_i0.y - x_i2.y;
r02.z = x_i0.z - x_i2.z;
minimum_image(r02);
r12.x = x_i1.x - x_i2.x;
r12.y = x_i1.y - x_i2.y;
r12.z = x_i1.z - x_i2.z;
minimum_image(r12);
// s01 = distance vec after unconstrained update, with PBC
X_FLOAT3 s01,s02,s12;
X_FLOAT3 xs_i0=_xshake[i0];
X_FLOAT3 xs_i1=_xshake[i1];
X_FLOAT3 xs_i2=_xshake[i2];
s01.x = xs_i0.x - xs_i1.x;
s01.y = xs_i0.y - xs_i1.y;
s01.z = xs_i0.z - xs_i1.z;
minimum_image(s01);
s02.x = xs_i0.x - xs_i2.x;
s02.y = xs_i0.y - xs_i2.y;
s02.z = xs_i0.z - xs_i2.z;
minimum_image(s02);
s12.x = xs_i1.x - xs_i2.x;
s12.y = xs_i1.y - xs_i2.y;
s12.z = xs_i1.z - xs_i2.z;
minimum_image(s12);
// scalar distances between atoms
X_FLOAT r01sq = r01.x*r01.x + r01.y*r01.y + r01.z*r01.z;
X_FLOAT r02sq = r02.x*r02.x + r02.y*r02.y + r02.z*r02.z;
X_FLOAT r12sq = r12.x*r12.x + r12.y*r12.y + r12.z*r12.z;
X_FLOAT s01sq = s01.x*s01.x + s01.y*s01.y + s01.z*s01.z;
X_FLOAT s02sq = s02.x*s02.x + s02.y*s02.y + s02.z*s02.z;
X_FLOAT s12sq = s12.x*s12.x + s12.y*s12.y + s12.z*s12.z;
// a,b,c = coeffs in quadratic equation for lamda
if (_rmass_flag) {
invmass0 = X_F(1.0)/_rmass[i0];
invmass1 = X_F(1.0)/_rmass[i1];
invmass2 = X_F(1.0)/_rmass[i2];
} else {
invmass0 = X_F(1.0)/_mass[static_cast <int> (x_i0.w)];
invmass1 = X_F(1.0)/_mass[static_cast <int> (x_i1.w)];
invmass2 = X_F(1.0)/_mass[static_cast <int> (x_i2.w)];
}
X_FLOAT a11 = X_F(2.0) * (invmass0+invmass1) *
(s01.x*r01.x + s01.y*r01.y + s01.z*r01.z);
X_FLOAT a12 = X_F(2.0) * invmass0 *
(s01.x*r02.x + s01.y*r02.y + s01.z*r02.z);
X_FLOAT a13 = - X_F(2.0) * invmass1 *
(s01.x*r12.x + s01.y*r12.y + s01.z*r12.z);
X_FLOAT a21 = X_F(2.0) * invmass0 *
(s02.x*r01.x + s02.y*r01.y + s02.z*r01.z);
X_FLOAT a22 = X_F(2.0) * (invmass0+invmass2) *
(s02.x*r02.x + s02.y*r02.y + s02.z*r02.z);
X_FLOAT a23 = X_F(2.0) * invmass2 *
(s02.x*r12.x + s02.y*r12.y + s02.z*r12.z);
X_FLOAT a31 = - X_F(2.0) * invmass1 *
(s12.x*r01.x + s12.y*r01.y + s12.z*r01.z);
X_FLOAT a32 = X_F(2.0) * invmass2 *
(s12.x*r02.x + s12.y*r02.y + s12.z*r02.z);
X_FLOAT a33 = X_F(2.0) * (invmass1+invmass2) *
(s12.x*r12.x + s12.y*r12.y + s12.z*r12.z);
// inverse of matrix
X_FLOAT determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 -
a11*a23*a32 - a12*a21*a33 - a13*a22*a31;
if (determ == X_F(0.0)) _flag[0]++;
X_FLOAT determinv = X_F(1.0)/determ;
X_FLOAT a11inv = determinv * (a22*a33 - a23*a32);
X_FLOAT a12inv = -determinv * (a12*a33 - a13*a32);
X_FLOAT a13inv = determinv * (a12*a23 - a13*a22);
X_FLOAT a21inv = -determinv * (a21*a33 - a23*a31);
X_FLOAT a22inv = determinv * (a11*a33 - a13*a31);
X_FLOAT a23inv = -determinv * (a11*a23 - a13*a21);
X_FLOAT a31inv = determinv * (a21*a32 - a22*a31);
X_FLOAT a32inv = -determinv * (a11*a32 - a12*a31);
X_FLOAT a33inv = determinv * (a11*a22 - a12*a21);
// quadratic correction coeffs
X_FLOAT r0102 = (r01.x*r02.x + r01.y*r02.y + r01.z*r02.z);
X_FLOAT r0112 = (r01.x*r12.x + r01.y*r12.y + r01.z*r12.z);
X_FLOAT r0212 = (r02.x*r12.x + r02.y*r12.y + r02.z*r12.z);
X_FLOAT quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
X_FLOAT quad1_0202 = invmass0*invmass0 * r02sq;
X_FLOAT quad1_1212 = invmass1*invmass1 * r12sq;
X_FLOAT quad1_0102 = X_F(2.0) * (invmass0+invmass1)*invmass0 * r0102;
X_FLOAT quad1_0112 = - X_F(2.0) * (invmass0+invmass1)*invmass1 * r0112;
X_FLOAT quad1_0212 = - X_F(2.0) * invmass0*invmass1 * r0212;
X_FLOAT quad2_0101 = invmass0*invmass0 * r01sq;
X_FLOAT quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq;
X_FLOAT quad2_1212 = invmass2*invmass2 * r12sq;
X_FLOAT quad2_0102 = X_F(2.0) * (invmass0+invmass2)*invmass0 * r0102;
X_FLOAT quad2_0112 = X_F(2.0) * invmass0*invmass2 * r0112;
X_FLOAT quad2_0212 = X_F(2.0) * (invmass0+invmass2)*invmass2 * r0212;
X_FLOAT quad3_0101 = invmass1*invmass1 * r01sq;
X_FLOAT quad3_0202 = invmass2*invmass2 * r02sq;
X_FLOAT quad3_1212 = (invmass1+invmass2)*(invmass1+invmass2) * r12sq;
X_FLOAT quad3_0102 = - X_F(2.0) * invmass1*invmass2 * r0102;
X_FLOAT quad3_0112 = - X_F(2.0) * (invmass1+invmass2)*invmass1 * r0112;
X_FLOAT quad3_0212 = X_F(2.0) * (invmass1+invmass2)*invmass2 * r0212;
// iterate until converged
X_FLOAT lamda01 = X_F(0.0);
X_FLOAT lamda02 = X_F(0.0);
X_FLOAT lamda12 = X_F(0.0);
int niter = 0;
int done = 0;
X_FLOAT quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda12_new;
//maybe all running full loop?
while (__any(!done) && niter < _max_iter) {
quad1 = quad1_0101 * lamda01*lamda01 +
quad1_0202 * lamda02*lamda02 +
quad1_1212 * lamda12*lamda12 +
quad1_0102 * lamda01*lamda02 +
quad1_0112 * lamda01*lamda12 +
quad1_0212 * lamda02*lamda12;
quad2 = quad2_0101 * lamda01*lamda01 +
quad2_0202 * lamda02*lamda02 +
quad2_1212 * lamda12*lamda12 +
quad2_0102 * lamda01*lamda02 +
quad2_0112 * lamda01*lamda12 +
quad2_0212 * lamda02*lamda12;
quad3 = quad3_0101 * lamda01*lamda01 +
quad3_0202 * lamda02*lamda02 +
quad3_1212 * lamda12*lamda12 +
quad3_0102 * lamda01*lamda02 +
quad3_0112 * lamda01*lamda12 +
quad3_0212 * lamda02*lamda12;
b1 = bond1*bond1 - s01sq - quad1;
b2 = bond2*bond2 - s02sq - quad2;
b3 = bond12*bond12 - s12sq - quad3;
lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3;
lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3;
lamda12_new = a31inv*b1 + a32inv*b2 + a33inv*b3;
done++;
done = (fabs(lamda01_new-lamda01) > _tolerance)?0: done;
done = (fabs(lamda02_new-lamda02) > _tolerance)?0: done;
done = (fabs(lamda12_new-lamda12) > _tolerance)?0: done;
lamda01 = done<2?lamda01_new:lamda01;
lamda02 = done<2?lamda02_new:lamda02;
lamda12 = done<2?lamda12_new:lamda12;
niter++;
}
// update forces if atom is owned by this processor
lamda01 *= X_F(1.0)/_dtfsq;
lamda02 *= X_F(1.0)/_dtfsq;
lamda12 *= X_F(1.0)/_dtfsq;
//attention: is the shake cluster <-> atom mapping unique?
nlist = 0;
if (i0 < _nlocal) {
_f[i0] += lamda01*r01.x + lamda02*r02.x;
_f[i0+_nmax] += lamda01*r01.y + lamda02*r02.y;
_f[i0+2*_nmax] += lamda01*r01.z + lamda02*r02.z;
list[nlist++] = i0;
}
if (i1 < _nlocal) {
_f[i1] -= lamda01*r01.x - lamda12*r12.x;
_f[i1+_nmax] -= lamda01*r01.y - lamda12*r12.y;
_f[i1+2*_nmax] -= lamda01*r01.z - lamda12*r12.z;
list[nlist++] = i1;
}
if (i2 < _nlocal) {
_f[i2] -= lamda02*r02.x + lamda12*r12.x;
_f[i2+_nmax] -= lamda02*r02.y + lamda12*r12.y;
_f[i2+2*_nmax] -= lamda02*r02.z + lamda12*r12.z;
list[nlist++] = i2;
}
if (vflag||vflag_atom) {
ENERGY_FLOAT* shared = &sharedmem[threadIdx.x];
X_FLOAT factor=X_F(2.0)/X_F(3.0)*nlist;
v[0] = lamda01*r01.x*r01.x + lamda02*r02.x*r02.x + lamda12*r12.x*r12.x; *shared = factor*v[0]; shared+=blockDim.x; //factor = 2*nlist/natoms: the 2 cancels the 0.5 the shared virial reducer applies, leaving the CPU code's n/total fraction
v[1] = lamda01*r01.y*r01.y + lamda02*r02.y*r02.y + lamda12*r12.y*r12.y; *shared = factor*v[1]; shared+=blockDim.x;
v[2] = lamda01*r01.z*r01.z + lamda02*r02.z*r02.z + lamda12*r12.z*r12.z; *shared = factor*v[2]; shared+=blockDim.x;
v[3] = lamda01*r01.x*r01.y + lamda02*r02.x*r02.y + lamda12*r12.x*r12.y; *shared = factor*v[3]; shared+=blockDim.x;
v[4] = lamda01*r01.x*r01.z + lamda02*r02.x*r02.z + lamda12*r12.x*r12.z; *shared = factor*v[4]; shared+=blockDim.x;
v[5] = lamda01*r01.y*r01.z + lamda02*r02.y*r02.z + lamda12*r12.y*r12.z; *shared = factor*v[5]; shared+=blockDim.x;
v_tally(vflag,vflag_atom,nlist,list,3.0,v);
}
}
__global__ void FixShakeCuda_Shake_Kernel(int vflag,int vflag_atom,int* list,int nlist)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<nlist)
{
int m = list[i];
int sflag = _shake_flag[m];
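// _shake_flag encodes the cluster type: 2, 3 and 4 are bond clusters with
// that many atoms; the remaining nonzero value (1 in fix shake) marks a
// 3-atom cluster whose 1-3 distance is constrained through an angle.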
if (sflag == 2) FixShakeCuda_Shake2(vflag,vflag_atom,m);
else if(sflag == 3) FixShakeCuda_Shake3(vflag,vflag_atom,m);
else if(sflag == 4) FixShakeCuda_Shake4(vflag,vflag_atom,m);
else FixShakeCuda_Shake3Angle(vflag,vflag_atom,m);
}
else
{
ENERGY_FLOAT* shared=&sharedmem[threadIdx.x];
*shared=ENERGY_F(0.0); shared+=blockDim.x;
*shared=ENERGY_F(0.0); shared+=blockDim.x;
*shared=ENERGY_F(0.0); shared+=blockDim.x;
*shared=ENERGY_F(0.0); shared+=blockDim.x;
*shared=ENERGY_F(0.0); shared+=blockDim.x;
*shared=ENERGY_F(0.0);
}
if(vflag)
{
__syncthreads();
int eflag=0;
PairVirialCompute_A_Kernel(eflag,vflag);
}
}
__global__ void FixShakeCuda_PackComm_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(j>_nmax) _flag[0]=1;
X_FLOAT3 xs=_xshake[j];
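// the comm buffer is struct-of-arrays (all x, then all y, then all z), so
// consecutive threads write contiguous, coalesced words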
((X_FLOAT*) _buffer)[i]=xs.x + dx;
((X_FLOAT*) _buffer)[i+1*n] = xs.y + dy;
((X_FLOAT*) _buffer)[i+2*n] = xs.z + dz;
}
}
__global__ void FixShakeCuda_PackComm_Self_Kernel(int* sendlist,int n,int maxlistlength,int iswap,X_FLOAT dx,X_FLOAT dy,X_FLOAT dz,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int* list=sendlist+iswap*maxlistlength;
if(i<n)
{
int j=list[i];
if(j>_nmax) _flag[0]=1;
X_FLOAT3 xs=_xshake[j];
xs.x += dx;
xs.y += dy;
xs.z += dz;
_xshake[i+first]=xs;
}
}
__global__ void FixShakeCuda_UnpackComm_Kernel(int n,int first)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i<n)
{
X_FLOAT3 xs;
xs.x=((X_FLOAT*) _buffer)[i];
xs.y=((X_FLOAT*) _buffer)[i+1*n];
xs.z=((X_FLOAT*) _buffer)[i+2*n];
_xshake[i+first]=xs;
}
}

View File

@ -1,64 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_temp_berendsen_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_temp_berendsen_cuda_cu.h"
#include "fix_temp_berendsen_cuda_kernel.cu"
void Cuda_FixTempBerendsenCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*) );
}
void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
}
void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor)
{
V_FLOAT factor=afactor;
if(sdata->atom.update_nmax)
Cuda_FixTempBerendsenCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixTempBerendsenCuda_EndOfStep_Kernel<<<grid, threads,0>>> (groupbit,factor);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixTempBerendsenCuda_PostForce: fix add_force post_force compute Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixTempBerendsenCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixTempBerendsenCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor);

View File

@ -1,36 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixTempBerendsenCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor)
{
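// factor is precomputed on the host; for Berendsen weak coupling it is the
// usual lambda = sqrt(1 + (dt/t_period)*(T_target/T_current - 1))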
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
_v[i]*=factor;
_v[i+_nmax]*=factor;
_v[i+2*_nmax]*=factor;
}
}

View File

@ -1,64 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_temp_rescale_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_temp_rescale_cuda_cu.h"
#include "fix_temp_rescale_cuda_kernel.cu"
void Cuda_FixTempRescaleCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*) );
}
void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
}
void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor)
{
V_FLOAT factor=afactor;
//if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
Cuda_FixTempRescaleCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
//cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixTempRescaleCuda_EndOfStep_Kernel<<<grid, threads,0>>> (groupbit,factor);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleCuda_PostForce: fix add_force post_force compute Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixTempRescaleCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixTempRescaleCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor);

View File

@ -1,36 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixTempRescaleCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
_v[i]*=factor;
_v[i+_nmax]*=factor;
_v[i+2*_nmax]*=factor;
}
}

View File

@ -1,64 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_temp_rescale_limit_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_temp_rescale_limit_cuda_cu.h"
#include "fix_temp_rescale_limit_cuda_kernel.cu"
void Cuda_FixTempRescaleLimitCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*) );
}
void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
}
void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor,double limit)
{
V_FLOAT factor=afactor;
//if(sdata->atom.update_nmax) //fix temp rescale is usually not called every timestep so it might miss an update step
Cuda_FixTempRescaleLimitCuda_UpdateNmax(sdata);
//if(sdata->atom.update_nlocal)
//cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(sdata->atom.nlocal);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel<<<grid, threads,0>>> (groupbit,factor,limit);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixTempRescaleLimitCuda_PostForce: fix add_force post_force compute Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixTempRescaleLimitCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixTempRescaleLimitCuda_EndOfStep(cuda_shared_data* sdata, int groupbit,double afactor,double limit);

View File

@ -1,43 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixTempRescaleLimitCuda_EndOfStep_Kernel(int groupbit,V_FLOAT factor,V_FLOAT limit)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
V_FLOAT vx = _v[i];
V_FLOAT vy = _v[i+_nmax];
V_FLOAT vz = _v[i+2*_nmax];
vx*=factor;
vy*=factor;
vz*=factor;
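// clamp each rescaled component to [-limit, +limit], preserving its sign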
_v[i]=vx>0?min(vx,limit):max(vx,-limit);
_v[i+_nmax]=vy>0?min(vy,limit):max(vy,-limit);
_v[i+2*_nmax]=vz>0?min(vz,limit):max(vz,-limit);
}
}

View File

@ -1,66 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define MY_PREFIX fix_viscous_cuda
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "fix_viscous_cuda_cu.h"
#include "fix_viscous_cuda_kernel.cu"
void Cuda_FixViscousCuda_UpdateNmax(cuda_shared_data* sdata)
{
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(v)      , & sdata->atom.v    .dev_data, sizeof(V_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(f) , & sdata->atom.f .dev_data, sizeof(F_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
}
void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata)
{
Cuda_FixViscousCuda_UpdateNmax(sdata);
}
void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit,void* gamma)
{
if(sdata->atom.update_nmax)
Cuda_FixViscousCuda_UpdateNmax(sdata);
if(sdata->atom.update_nlocal)
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
int3 layout=getgrid(sdata->atom.nlocal,0);
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
Cuda_FixViscousCuda_PostForce_Kernel<<<grid, threads,0>>> (groupbit,(F_FLOAT*) gamma);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_Cuda_FixViscousCuda_PostForce: Kernel execution failed");
}

View File

@ -1,27 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
extern "C" void Cuda_FixViscousCuda_Init(cuda_shared_data* sdata);
extern "C" void Cuda_FixViscousCuda_PostForce(cuda_shared_data* sdata, int groupbit,void* gamma);

View File

@ -1,35 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
__global__ void Cuda_FixViscousCuda_PostForce_Kernel(int groupbit,F_FLOAT* gamma)
{
int i=(blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
if(i < _nlocal)
if (_mask[i] & groupbit) {
F_FLOAT drag = gamma[_type[i]];
_f[i] -= drag*_v[i];
_f[i+1*_nmax] -= drag*_v[i+1*_nmax];
_f[i+2*_nmax] -= drag*_v[i+2*_nmax];
}
}


@@ -1,367 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#include <time.h>
#define MY_PREFIX neighbor
#define IncludeCommonNeigh
#include "cuda_shared.h"
#include "cuda_common.h"
#include "crm_cuda_utils.cu"
#include "cuda_wrapper_cu.h"
#define _cutneighsq MY_AP(cutneighsq)
#define _ex_type MY_AP(ex_type)
#define _nex_type MY_AP(nex_type)
#define _ex1_bit MY_AP(ex1_bit)
#define _ex2_bit MY_AP(ex2_bit)
#define _nex_group MY_AP(nex_group)
#define _ex_mol_bit MY_AP(ex_mol_bit)
#define _nex_mol MY_AP(nex_mol)
__device__ __constant__ CUDA_FLOAT* _cutneighsq;
__device__ __constant__ int* _ex_type;
__device__ __constant__ int _nex_type;
__device__ __constant__ int* _ex1_bit;
__device__ __constant__ int* _ex2_bit;
__device__ __constant__ int _nex_group;
__device__ __constant__ int* _ex_mol_bit;
__device__ __constant__ int _nex_mol;
#include "neighbor_cu.h"
#include "neighbor_kernel.cu"
void Cuda_Neighbor_UpdateBuffer(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
CUT_CHECK_ERROR("Cuda_PairLJCutCuda: before updateBuffer failed");
int size=(unsigned)(sizeof(int)*20+sneighlist->bin_dim[0]*sneighlist->bin_dim[1]*sneighlist->bin_dim[2]*(sizeof(int)+sneighlist->bin_nmax*3*sizeof(CUDA_FLOAT)));
if(sdata->buffersize<size)
{
MYDBG(printf("Cuda_Neighbor Resizing Buffer at %p with %i kB to\n",sdata->buffer,sdata->buffersize);)
if(sdata->buffer!=NULL) CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer=CudaWrapper_AllocCudaData(size);
sdata->buffersize=size;
sdata->buffer_new++;
MYDBG(printf("New buffer at %p with %i kB\n",sdata->buffer,sdata->buffersize);)
}
cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*) );
CUT_CHECK_ERROR("Cuda_PairLJCutCuda: updateBuffer failed");
}
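// Sort all atoms (local + ghost) into a regular grid of bins. rez_bin_size
// holds the reciprocal bin edge lengths; the -4.0 terms and the +2 offsets in
// the kernel leave a border of two bins on each side of the subdomain for
// ghost atoms. Returns nonzero if a bin overflowed bin_nmax (bin_extraspace
// is then increased so the caller can rebuild with larger bins).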
int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
if(sdata->buffer_new)
Cuda_Neighbor_UpdateBuffer(sdata,sneighlist);
// initialize only on first call
CUDA_FLOAT rez_bin_size[3] =
{
(1.0 * sneighlist->bin_dim[0]-4.0) / (sdata->domain.subhi[0] - sdata->domain.sublo[0]),
(1.0 * sneighlist->bin_dim[1]-4.0) / (sdata->domain.subhi[1] - sdata->domain.sublo[1]),
(1.0 * sneighlist->bin_dim[2]-4.0) / (sdata->domain.subhi[2] - sdata->domain.sublo[2])
};
/*static*/ short init = 0; // not static: the symbols below can change between calls
if(! init)
{
init = 1;
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(sublo) , sdata->domain.sublo , sizeof(X_FLOAT)*3);
}
int3 layout = getgrid(sdata->atom.nall); // sneighlist->inum
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
timespec starttime,endtime;
clock_gettime(CLOCK_REALTIME,&starttime);
cudaMemset((int*) (sdata->buffer),0,sizeof(int)*(20+(sneighlist->bin_dim[0])*(sneighlist->bin_dim[1])*(sneighlist->bin_dim[2]))+3*sizeof(CUDA_FLOAT)*(sneighlist->bin_dim[0])*(sneighlist->bin_dim[1])*(sneighlist->bin_dim[2])*(sneighlist->bin_nmax));
Binning_Kernel<<<grid, threads>>> (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],sneighlist->bin_dim[2],rez_bin_size[0],rez_bin_size[1],rez_bin_size[2]);
cudaThreadSynchronize();
clock_gettime(CLOCK_REALTIME,&endtime);
sdata->cuda_timings.neigh_bin+=
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
int binning_error;
cudaMemcpy((void*) &binning_error,(void*) sdata->buffer,1*sizeof(int),cudaMemcpyDeviceToHost);
if(binning_error)
{
sneighlist->bin_extraspace+=0.05;
}
else
{
MYDBG(printf("CUDA: binning successful\n");)
}
CUT_CHECK_ERROR("Cuda_Binning: binning Kernel execution failed");
return binning_error;
}
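// Build a full neighbor list (both i,j and j,i stored) from the binned atoms.
// If all type-pair cutoffs are equal, the common squared cutoff is passed to
// the kernel as globcutoff so the per-pair cutoff table lookup can be skipped;
// globcutoff stays negative otherwise.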
int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
//Cuda_Neighbor_UpdateBuffer(sdata,sneighlist);
CUDA_FLOAT globcutoff=-1.0;
/*static*/ short init=0; // not static: globcutoff must be recomputed on every call
if(! init)
{
init = 1;
// !! LAMMPS indexes atom types starting with 1 !!
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes;
CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx);
//printf("Allocate: %i\n",nx);
sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx);
if(sneighlist->cutneighsq)
{
int cutoffsdiffer=0;
double cutoff0 = sneighlist->cutneighsq[1][1];
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT) (sneighlist->cutneighsq[i][j]);
if((sneighlist->cutneighsq[i][j]-cutoff0)*(sneighlist->cutneighsq[i][j]-cutoff0)>1e-6) cutoffsdiffer++;
}
}
if(!cutoffsdiffer) globcutoff=(CUDA_FLOAT) cutoff0;
}
else
{
MYEMUDBG( printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n"); )
return 0;
}
int size = 100;
if(sdata->buffersize < size)
{
MYDBG( printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize); )
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
sdata->buffer_new++;
MYDBG( printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize); )
}
CudaWrapper_UploadCudaData(acutneighsq,sneighlist->cu_cutneighsq,nx);
free(acutneighsq); // host staging copy no longer needed
cudaMemcpyToSymbol(MY_CONST(cutneighsq) , &sneighlist->cu_cutneighsq , sizeof(CUDA_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(special_flag) , sdata->atom.special_flag , 4*sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(molecular) , & sdata->atom.molecular , sizeof(int) );
}
cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned) );
//cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(mask) , & sdata->atom.mask .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(tag) , & sdata->atom.tag .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(special) , & sdata->atom.special .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(maxspecial) , & sdata->atom.maxspecial , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nspecial) , & sdata->atom.nspecial .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(debugdata) , & sdata->debugdata , sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(overlap_comm) , & sdata->overlap_comm, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(neighbors) , & sneighlist->neighbors.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(ex_type) , & sneighlist->ex_type.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(ex1_bit) , & sneighlist->ex1_bit.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(ex2_bit) , & sneighlist->ex2_bit.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(ex_mol_bit) , & sneighlist->ex_mol_bit.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(nex_type) , & sneighlist->nex_type, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nex_group) , & sneighlist->nex_group, sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nex_mol) , & sneighlist->nex_mol, sizeof(int) );
if(sdata->overlap_comm)
{
cudaMemcpyToSymbol(MY_CONST(numneigh_border) , & sneighlist->numneigh_border .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(numneigh_inner) , & sneighlist->numneigh_inner .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(neighbors_border) , & sneighlist->neighbors_border.dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(neighbors_inner) , & sneighlist->neighbors_inner .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(ilist_border) , & sneighlist->ilist_border .dev_data, sizeof(int*));
cudaMemcpyToSymbol(MY_CONST(inum_border) , & sneighlist->inum_border .dev_data, sizeof(int*) );
}
//dim3 threads(sneighlist->bin_nmax,1,1);
dim3 threads(MIN(128,sneighlist->bin_nmax),1,1);
dim3 grid(sneighlist->bin_dim[0]*sneighlist->bin_dim[1],sneighlist->bin_dim[2],1);
//printf("Configuration: %i %i %i %i %i\n",grid.x,grid.y,threads.x,(sizeof(int)+3*sizeof(X_FLOAT))*threads.x,sneighlist->bin_nmax);
int buffer[20];
buffer[0]=1;
buffer[1]=0;
CudaWrapper_UploadCudaData( buffer, sdata->buffer, 2*sizeof(int));
CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel error");
//cudaMemset(sdata->debugdata,0,100*sizeof(int));
unsigned int shared_size=(sizeof(int)+3*sizeof(CUDA_FLOAT))*threads.x;
MYDBG(printf("Configuration: %i %i %i %u %i\n",grid.x,grid.y,threads.x,shared_size,sneighlist->bin_nmax);)
//shared_size=2056;
timespec starttime,endtime;
clock_gettime(CLOCK_REALTIME,&starttime);
//for(int i=0;i<100;i++)
{
if(sdata->overlap_comm)
NeighborBuildFullBin_OverlapComm_Kernel<<<grid,threads,shared_size>>>
(sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom);
else
{
int exclude=sneighlist->nex_mol|sneighlist->nex_group|sneighlist->nex_type;
if(exclude)
NeighborBuildFullBin_Kernel<1><<<grid,threads,shared_size>>>
(sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom,sdata->pair.neighall);
else
NeighborBuildFullBin_Kernel<0><<<grid,threads,shared_size>>>
(sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff,sdata->pair.use_block_per_atom,sdata->pair.neighall);
}
//NeighborBuildFullBin_Kernel_Restrict<<<grid,threads,(2*sizeof(int)+3*sizeof(X_FLOAT))*threads.x+sizeof(int)>>>
// (sneighlist->binned_id,sneighlist->bin_nmax,sneighlist->bin_dim[0],sneighlist->bin_dim[1],globcutoff);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
clock_gettime(CLOCK_REALTIME,&endtime);
sdata->cuda_timings.neigh_build+=
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
//dim3 threads,grid;
CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int));
if(buffer[0]>=0&&true&&sdata->atom.molecular)
{
//printf("Find Special: %i %i\n",sneighlist->inum,sdata->atom.nall);
clock_gettime(CLOCK_REALTIME,&starttime);
int3 layout=getgrid(sdata->atom.nlocal,0,512);
threads.x = layout.z; threads.y = 1; threads.z = 1;
grid.x = layout.x; grid.y = layout.y; grid.z = 1;
FindSpecial<<<grid,threads>>>(sdata->pair.use_block_per_atom);
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_NeighborBuild: FindSpecial kernel execution failed");
clock_gettime(CLOCK_REALTIME,&endtime);
sdata->cuda_timings.neigh_special+=
endtime.tv_sec-starttime.tv_sec+1.0*(endtime.tv_nsec-starttime.tv_nsec)/1000000000;
}
}
//printf("Neightime: %lf\n",sdata->cuda_timings.test1);
CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
//CudaWrapper_DownloadCudaData(buffer, sneighlist->numneigh_border .dev_data, sizeof(int));
MYDBG(printf("Cuda_NeighborBuildFullBin build neighbor list ... end\n");)
return buffer[0];
}
int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist)
{
MYDBG(printf("Cuda_NeighborBuildFullNsq build neighbor list ... start\n");)
// initialize only on first call
/*static*/ short init=0;
if(! init)
{
init = 1;
// !! LAMMPS indexes atom types starting with 1 !!
unsigned cuda_ntypes = sdata->atom.ntypes + 1;
if(cuda_ntypes*cuda_ntypes > CUDA_MAX_TYPES2)
printf("# CUDA: Cuda_PairLJCutCuda_Init: you need %u types. this is more than %u "
"(assumed at compile time). re-compile with -DCUDA_MAX_TYPES_PLUS_ONE=32 "
"or ajust this in cuda_common.h\n", cuda_ntypes, CUDA_MAX_TYPES2);
unsigned nx = sizeof(CUDA_FLOAT) * cuda_ntypes * cuda_ntypes;
CUDA_FLOAT* acutneighsq = (CUDA_FLOAT*) malloc(nx);
if(sneighlist->cutneighsq)
{
for(int i=1; i<=sdata->atom.ntypes; ++i)
{
for(int j=1; j<=sdata->atom.ntypes; ++j)
{
acutneighsq[i * cuda_ntypes + j] = (CUDA_FLOAT) (sneighlist->cutneighsq[i][j]);
//printf("CUTOFFS: %i %i %i %e\n",i,j,cuda_ntypes,acutneighsq[i * cuda_ntypes + j]);
}
}
}
else
{
MYEMUDBG( printf("# CUDA: Cuda_NeighborBuild: cutneighsq == NULL\n"); )
return 0;
}
int size = 100;
if(sdata->buffersize < size)
{
MYDBG( printf("Cuda_NeighborBuild Resizing Buffer at %p with %i kB to\n", sdata->buffer, sdata->buffersize); )
CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
sdata->buffer = CudaWrapper_AllocCudaData(size);
sdata->buffersize = size;
sdata->buffer_new++;
MYDBG( printf("New buffer at %p with %i kB\n", sdata->buffer, sdata->buffersize); )
}
cudaMemcpyToSymbol(MY_CONST(buffer) , & sdata->buffer , sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(cuda_ntypes) , & cuda_ntypes , sizeof(unsigned) );
// _cutneighsq is a device pointer symbol, so upload the table and store its address
sneighlist->cu_cutneighsq = (CUDA_FLOAT*) CudaWrapper_AllocCudaData(nx);
CudaWrapper_UploadCudaData(acutneighsq, sneighlist->cu_cutneighsq, nx);
cudaMemcpyToSymbol(MY_CONST(cutneighsq)   , & sneighlist->cu_cutneighsq     , sizeof(CUDA_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(neighbor_maxlocal), & sneighlist->firstneigh.dim[0] , sizeof(unsigned) );
cudaMemcpyToSymbol(MY_CONST(firstneigh) , & sneighlist->firstneigh.dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(ilist) , & sneighlist->ilist .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(inum) , & sneighlist->inum , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nlocal) , & sdata->atom.nlocal , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nall) , & sdata->atom.nall , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(nmax) , & sdata->atom.nmax , sizeof(int) );
cudaMemcpyToSymbol(MY_CONST(numneigh) , & sneighlist->numneigh .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(type) , & sdata->atom.type .dev_data, sizeof(int*) );
cudaMemcpyToSymbol(MY_CONST(x) , & sdata->atom.x .dev_data, sizeof(X_FLOAT*) );
cudaMemcpyToSymbol(MY_CONST(maxneighbors) , & sneighlist->maxneighbors , sizeof(int) );
free(acutneighsq);
}
int3 layout = getgrid(sdata->atom.nlocal); // sneighlist->inum
dim3 threads(layout.z, 1, 1);
dim3 grid(layout.x, layout.y, 1);
int return_value = 1;
CudaWrapper_UploadCudaData(& return_value, sdata->buffer, sizeof(int));
CUT_CHECK_ERROR("Cuda_NeighborBuild: pre neighbor build kernel execution failed");
NeighborBuildFullNsq_Kernel<<<grid, threads>>> ();
cudaThreadSynchronize();
CUT_CHECK_ERROR("Cuda_NeighborBuild: neighbor build kernel execution failed");
int buffer[20];
CudaWrapper_DownloadCudaData(buffer, sdata->buffer, sizeof(int)*20);
MYDBG(printf("Cuda_NeighborBuildFullNSQ build neighbor list ... end\n");)
return buffer[0];
}


@@ -1,32 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#ifndef NEIGHBOR_CU_H_
#define NEIGHBOR_CU_H_
#include "cuda_shared.h"
extern "C" int Cuda_BinAtoms(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
extern "C" int Cuda_NeighborBuildFullBin(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
extern "C" int Cuda_NeighborBuildFullNsq(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist);
#endif /*NEIGHBOR_CU_H_*/


@@ -1,626 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#define SBBITS 30
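// Neighbor indices carry special-bond information in their two topmost bits;
// SBBITS is the shift used by FindSpecial below to encode the 1-2/1-3/1-4
// level into a neighbor index (j ^ (which << SBBITS)).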
__global__ void Binning_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,int bin_dim_z,
CUDA_FLOAT rez_bin_size_x,CUDA_FLOAT rez_bin_size_y,CUDA_FLOAT rez_bin_size_z)
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
/*int* bin_count=(int*) _buffer;
bin_count=bin_count+20;
CUDA_FLOAT* binned_x=(CUDA_FLOAT*)(bin_count+bin_dim_x*bin_dim_y*bin_dim_z);*/
CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer;
binned_x = &binned_x[2];
int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax];
if(i < _nall)
{
// copy atom position from global device memory to local register
// in these three steps to get as much coalesced access as possible
X_FLOAT* my_x = _x + i;
CUDA_FLOAT x_i = *my_x; my_x += _nmax;
CUDA_FLOAT y_i = *my_x; my_x += _nmax;
CUDA_FLOAT z_i = *my_x;
// calculate flat bin index
int bx=__float2int_rd(rez_bin_size_x * (x_i - _sublo[0]))+2;
int by=__float2int_rd(rez_bin_size_y * (y_i - _sublo[1]))+2;
int bz=__float2int_rd(rez_bin_size_z * (z_i - _sublo[2]))+2;
bx-=bx*negativCUDA(1.0f*bx);
bx-=(bx-bin_dim_x+1)*negativCUDA(1.0f*bin_dim_x-1.0f-1.0f*bx);
by-=by*negativCUDA(1.0f*by);
by-=(by-bin_dim_y+1)*negativCUDA(1.0f*bin_dim_y-1.0f-1.0f*by);
bz-=bz*negativCUDA(1.0f*bz);
bz-=(bz-bin_dim_z+1)*negativCUDA(1.0f*bin_dim_z-1.0f-1.0f*bz);
const unsigned j = bin_dim_z * ( bin_dim_y *bx+by)+bz;
// add new atom to bin, get bin-array position
const unsigned k = atomicAdd(& bin_count[j], 1);
if(k < bin_nmax)
{
binned_id [bin_nmax * j + k] = i;
binned_x [3 * bin_nmax * j + k] = x_i;
binned_x [3 * bin_nmax * j + k + bin_nmax] = y_i;
binned_x [3 * bin_nmax * j + k + 2*bin_nmax] = z_i;
}
else
{ // normally, this should not happen:
atomicAdd((int*) _buffer, 1);
MYEMUDBG( printf("# CUDA: Binning_Kernel: WARNING: atom %i ignored, no place left in bin %u\n", i, j); )
}
}
}
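// Returns 1 if pair (i,j) is excluded from the neighbor list by a type, group,
// or molecule exclusion rule; the device-side analogue of the exclusion test
// in LAMMPS' CPU Neighbor class.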
__device__ inline int exclusion(int &i, int &j, int &itype, int &jtype)
{
int m;
if (_nex_type)
if( _ex_type[itype * _cuda_ntypes + jtype]) return 1;
if (_nex_group) {
for (m = 0; m < _nex_group; m++) {
if (_mask[i] & _ex1_bit[m] && _mask[j] & _ex2_bit[m]) return 1;
if (_mask[i] & _ex2_bit[m] && _mask[j] & _ex1_bit[m]) return 1;
}
}
if (_nex_mol) {
if(_molecule[i] == _molecule[j])
for (m = 0; m < _nex_mol; m++)
if (_mask[i] & _ex_mol_bit[m] && _mask[j] & _ex_mol_bit[m] ) return 1;
}
return 0;
}
extern __shared__ CUDA_FLOAT shared[];
__device__ inline int find_special(int3 &n, int* list,int & tag,int3 flag)
{
int k=n.z;
for (int l = 0; l < n.z; l++) k = ((list[l] == tag)?l:k);
return k<n.x ? flag.x : (k<n.y? flag.y : (k<n.z?flag.z:0));
}
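// Full-bin neighbor build, one thread block per bin: atoms of the home bin are
// processed in tiles of blockDim.x threads, and candidate neighbors from the
// home bin and the 26 surrounding bins are staged through shared memory
// (other_x / other_id) one tile at a time.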
template <const unsigned int exclude>
__global__ void NeighborBuildFullBin_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style, bool neighall)
{
int natoms = neighall?_nall:_nlocal;
//const bool domol=false;
int bin_dim_z=gridDim.y;
CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer;
binned_x = &binned_x[2];
int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax];
int bin = __mul24(gridDim.y,blockIdx.x)+blockIdx.y;
int bin_x = blockIdx.x/bin_dim_y;
int bin_y = blockIdx.x-bin_x*bin_dim_y;
int bin_z = blockIdx.y;
int bin_c = bin_count[bin];
CUDA_FLOAT cut;
if(globcutoff>0)
cut = globcutoff;
int i=_nall;
CUDA_FLOAT* my_x;
CUDA_FLOAT x_i,y_i,z_i;
for(int actOffset=0; actOffset<bin_c; actOffset+=blockDim.x){
int actIdx=threadIdx.x+actOffset;
CUDA_FLOAT* other_x=shared;
int* other_id=(int*) &other_x[3*blockDim.x];
if(actIdx < bin_c)
{
i = binned_id[__mul24(bin,bin_nmax)+actIdx];
my_x = binned_x + __mul24(__mul24(bin,3),bin_nmax)+actIdx;
x_i = *my_x; my_x += bin_nmax;
y_i = *my_x; my_x += bin_nmax;
z_i = *my_x;
}
else
i=2*_nall;
__syncthreads();
int jnum=0;
int itype;
if(i<natoms)
{
jnum = 0;
_ilist[i]=i;
itype = _type[i];
}
//__syncthreads();
for(int otherActOffset=0; otherActOffset<bin_c; otherActOffset+=blockDim.x){
int otherActIdx=threadIdx.x+otherActOffset;
if(otherActIdx<bin_c)
{
if(otherActOffset==actOffset)
{
other_id[threadIdx.x]=i;
other_x[threadIdx.x] = x_i;
other_x[threadIdx.x+blockDim.x] = y_i;
other_x[threadIdx.x+2*blockDim.x] = z_i;
}
else
{
other_id[threadIdx.x] = binned_id[__mul24(bin,bin_nmax)+otherActIdx];
my_x = binned_x + __mul24(__mul24(bin,3),bin_nmax)+otherActIdx;
other_x[threadIdx.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+blockDim.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+__mul24(2,blockDim.x)] = *my_x;
}
}
__syncthreads();
int kk=threadIdx.x;
for(int k = 0; k < MIN(bin_c-otherActOffset,blockDim.x); ++k)
{
if(i<natoms)
{
kk++;
kk=kk<MIN(bin_c-otherActOffset,blockDim.x)?kk:0;
int j = other_id[kk];
if(exclude && exclusion(i,j,itype,_type[j])) continue;
if(globcutoff<0)
{
int jtype = _type[j];
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
}
CUDA_FLOAT delx = x_i - other_x[kk];
CUDA_FLOAT dely = y_i - other_x[kk+blockDim.x];
CUDA_FLOAT delz = z_i - other_x[kk+2*blockDim.x];
CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if(rsq <= cut && i != j)
{
if(jnum<_maxneighbors){
if(block_style)
_neighbors[i*_maxneighbors+jnum]= j;
else
_neighbors[i+jnum*natoms]= j;
}
++jnum;
}
}
}
__syncthreads();
}
for(int obin_x=bin_x-1;obin_x<bin_x+2;obin_x++)
for(int obin_y=bin_y-1;obin_y<bin_y+2;obin_y++)
for(int obin_z=bin_z-1;obin_z<bin_z+2;obin_z++)
{
if(obin_x<0||obin_y<0||obin_z<0) continue;
if(obin_x>=bin_dim_x||obin_y>=bin_dim_y||obin_z>=bin_dim_z) continue;
int other_bin=bin_dim_z * ( bin_dim_y * obin_x + obin_y) + obin_z;
if(other_bin==bin) continue;
int obin_c=bin_count[other_bin];
for(int otherActOffset=0; otherActOffset<obin_c; otherActOffset+=blockDim.x){
int otherActIdx=otherActOffset+threadIdx.x;
if(threadIdx.x < MIN(blockDim.x,obin_c-otherActOffset))
{
other_id[threadIdx.x] = binned_id[__mul24(other_bin,bin_nmax)+otherActIdx];
my_x = binned_x + __mul24(__mul24(other_bin,3),bin_nmax)+otherActIdx;
other_x[threadIdx.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+blockDim.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+2*blockDim.x] = *my_x;
}
__syncthreads();
for(int k = 0; k < MIN(blockDim.x,obin_c-otherActOffset); ++k)
{
if(i<natoms)
{
int j = other_id[k];
if(exclude && exclusion(i,j,itype,_type[j])) continue;
if(globcutoff<0)
{
int jtype = _type[j];
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
}
CUDA_FLOAT delx = x_i - other_x[k];
CUDA_FLOAT dely = y_i - other_x[k+blockDim.x];
CUDA_FLOAT delz = z_i - other_x[k+2*blockDim.x];
CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if(rsq <= cut && i != j)
{
if(jnum<_maxneighbors)
{
if(block_style)
_neighbors[i*_maxneighbors+jnum]= j;
else
_neighbors[i+jnum*natoms]= j;
}
++jnum;
}
}
}
__syncthreads();
}
}
if(jnum > _maxneighbors) ((int*)_buffer)[0] = -jnum;
if(i<natoms)
_numneigh[i] = jnum;
}
}
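// For molecular systems: scan each atom's neighbors for 1-2/1-3/1-4 special
// partners and, depending on _special_flag, either drop the neighbor (weight
// 0.0), keep it untouched (weight 1.0), or encode the special-bond level into
// the SBBITS bits of the stored neighbor index.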
__global__ void FindSpecial(int block_style)
{
int ii = (blockIdx.x*gridDim.y+blockIdx.y)*blockDim.x+threadIdx.x;
int which;
int tag_mask=0;
int3 spec_flag;
int3 mynspecial = {0,0,1};
if(ii>=_nlocal) return;
int special_id[CUDA_MAX_NSPECIAL];
int i = _ilist[ii];
if(i>=_nlocal) return;
int jnum = _numneigh[i];
if (_special_flag[1] == 0) spec_flag.x = -1;
else if (_special_flag[1] == 1) spec_flag.x = 0;
else spec_flag.x = 1;
if (_special_flag[2] == 0) spec_flag.y = -1;
else if (_special_flag[2] == 1) spec_flag.y = 0;
else spec_flag.y = 2;
if (_special_flag[3] == 0) spec_flag.z = -1;
else if (_special_flag[3] == 1) spec_flag.z = 0;
else spec_flag.z = 3;
mynspecial.x=_nspecial[i];
mynspecial.y=_nspecial[i+_nmax];
mynspecial.z=_nspecial[i+2*_nmax];
if(i<_nlocal)
{
int* list = &_special[i];
for(int k=0;k<mynspecial.z;k++)
{
special_id[k]=list[k*_nmax];
tag_mask = tag_mask|special_id[k];
}
}
for(int k=0;k<MIN(jnum,_maxneighbors);k++)
{
int j;
if(block_style)
j = _neighbors[i*_maxneighbors+k];
else
j = _neighbors[i+k*_nlocal];
int tag_j=_tag[j];
which=0;
if((tag_mask&tag_j)==tag_j)
{
which = find_special(mynspecial,special_id,tag_j,spec_flag);
if(which>0)
{
if(block_style)
_neighbors[i*_maxneighbors+k]=j ^ (which << SBBITS);
else
_neighbors[i+k*_nlocal]=j ^ (which << SBBITS);
}
else if(which<0)
{
if(block_style)
_neighbors[i*_maxneighbors+k]=_neighbors[i*_maxneighbors+jnum-1];
else
_neighbors[i+k*_nlocal]=_neighbors[i+(jnum-1)*_nlocal];
jnum--;
k--;
}
}
}
_numneigh[i]=jnum;
}
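// Variant of the full-bin build used when sdata->overlap_comm is set: besides
// the full list it splits neighbors into an "inner" list (both atoms local)
// and per-border-atom "border" lists (ghost partners), which lets force
// kernels run on inner pairs while ghost communication is still in flight.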
__global__ void NeighborBuildFullBin_OverlapComm_Kernel(int* binned_id,int bin_nmax,int bin_dim_x,int bin_dim_y,CUDA_FLOAT globcutoff,int block_style)
{
int bin_dim_z=gridDim.y;
CUDA_FLOAT* binned_x=(CUDA_FLOAT*) _buffer;
binned_x = &binned_x[2];
int* bin_count=(int*) &binned_x[3*bin_dim_x*bin_dim_y*bin_dim_z*bin_nmax];
int bin = __mul24(gridDim.y,blockIdx.x)+blockIdx.y;
int bin_x = blockIdx.x/bin_dim_y;
int bin_y = blockIdx.x-bin_x*bin_dim_y;
int bin_z = blockIdx.y;
int bin_c = bin_count[bin];
CUDA_FLOAT cut;
if(globcutoff>0)
cut = globcutoff;
int i=_nall;
CUDA_FLOAT* my_x;
CUDA_FLOAT x_i,y_i,z_i;
for(int actOffset=0; actOffset<bin_c; actOffset+=blockDim.x){
int actIdx=threadIdx.x+actOffset;
CUDA_FLOAT* other_x=shared;
int* other_id=(int*) &other_x[3*blockDim.x];
if(actIdx < bin_c)
{
i = binned_id[__mul24(bin,bin_nmax)+actIdx];
my_x = binned_x + __mul24(__mul24(bin,3),bin_nmax)+actIdx;
x_i = *my_x; my_x += bin_nmax;
y_i = *my_x; my_x += bin_nmax;
z_i = *my_x;
}
else
i=2*_nall;
__syncthreads();
int jnum=0;
int jnum_border=0;
int jnum_inner=0;
int i_border=-1;
int itype;
if(i<_nlocal)
{
jnum = 0;
_ilist[i]=i;
itype = _type[i];
}
__syncthreads();
for(int otherActOffset=0; otherActOffset<bin_c; otherActOffset+=blockDim.x){
int otherActIdx=threadIdx.x+otherActOffset;
if(otherActIdx<bin_c)
{
if(otherActOffset==actOffset)
{
other_id[threadIdx.x]=i;
other_x[threadIdx.x] = x_i;
other_x[threadIdx.x+blockDim.x] = y_i;
other_x[threadIdx.x+2*blockDim.x] = z_i;
}
else
{
other_id[threadIdx.x] = binned_id[__mul24(bin,bin_nmax)+otherActIdx];
my_x = binned_x + __mul24(__mul24(bin,3),bin_nmax)+otherActIdx;
other_x[threadIdx.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+blockDim.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+__mul24(2,blockDim.x)] = *my_x;
}
}
__syncthreads();
int kk=threadIdx.x;
for(int k = 0; k < MIN(bin_c-otherActOffset,blockDim.x); ++k)
{
if(i<_nlocal)
{
kk++;
kk=kk<MIN(bin_c-otherActOffset,blockDim.x)?kk:0;
int j = other_id[kk];
if(globcutoff<0)
{
int jtype = _type[j];
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
}
CUDA_FLOAT delx = x_i - other_x[kk];
CUDA_FLOAT dely = y_i - other_x[kk+blockDim.x];
CUDA_FLOAT delz = z_i - other_x[kk+2*blockDim.x];
CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if(rsq <= cut && i != j)
{
if((j>=_nlocal)&&(i_border<0))
i_border=atomicAdd(_inum_border,1);
if(jnum<_maxneighbors)
{
if(block_style)
{
_neighbors[i*_maxneighbors+jnum]= j;
if(j>=_nlocal)
{_neighbors_border[i_border*_maxneighbors+jnum_border]=j;}
else
{_neighbors_inner[i*_maxneighbors+jnum_inner]=j;}
}
else
{
_neighbors[i+jnum*_nlocal]=j;
if(j>=_nlocal)
{_neighbors_border[i_border+jnum_border*_nlocal]=j;}
else
{_neighbors_inner[i+jnum_inner*_nlocal]=j;}
}
}
++jnum;
if(j>=_nlocal)
jnum_border++;
else
jnum_inner++;
}
}
}
__syncthreads();
}
for(int obin_x=bin_x-1;obin_x<bin_x+2;obin_x++)
for(int obin_y=bin_y-1;obin_y<bin_y+2;obin_y++)
for(int obin_z=bin_z-1;obin_z<bin_z+2;obin_z++)
{
if(obin_x<0||obin_y<0||obin_z<0) continue;
if(obin_x>=bin_dim_x||obin_y>=bin_dim_y||obin_z>=bin_dim_z) continue;
int other_bin=bin_dim_z * ( bin_dim_y * obin_x + obin_y) + obin_z;
if(other_bin==bin) continue;
int obin_c=bin_count[other_bin];
for(int otherActOffset=0; otherActOffset<obin_c; otherActOffset+=blockDim.x){
int otherActIdx=otherActOffset+threadIdx.x;
if(threadIdx.x < MIN(blockDim.x,obin_c-otherActOffset))
{
other_id[threadIdx.x] = binned_id[__mul24(other_bin,bin_nmax)+otherActIdx];
my_x = binned_x + __mul24(__mul24(other_bin,3),bin_nmax)+otherActIdx;
other_x[threadIdx.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+blockDim.x] = *my_x; my_x += bin_nmax;
other_x[threadIdx.x+2*blockDim.x] = *my_x;
}
__syncthreads();
for(int k = 0; k < MIN(blockDim.x,obin_c-otherActOffset); ++k)
{
if(i<_nlocal)
{
int j = other_id[k];
if(globcutoff<0)
{
int jtype = _type[j];
cut = _cutneighsq[itype * _cuda_ntypes + jtype];
}
CUDA_FLOAT delx = x_i - other_x[k];
CUDA_FLOAT dely = y_i - other_x[k+blockDim.x];
CUDA_FLOAT delz = z_i - other_x[k+2*blockDim.x];
CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz;
if(rsq <= cut && i != j)
{
if((j>=_nlocal)&&(i_border<0))
i_border=atomicAdd(_inum_border,1);
if(jnum<_maxneighbors)
{
if(block_style)
{
_neighbors[i*_maxneighbors+jnum]= j;
if(j>=_nlocal)
{_neighbors_border[i_border*_maxneighbors+jnum_border]=j;}
else
{_neighbors_inner[i*_maxneighbors+jnum_inner]=j;}
}
else
{
_neighbors[i+jnum*_nlocal]=j;
if(j>=_nlocal)
{_neighbors_border[i_border+jnum_border*_nlocal]=j;}
else
{_neighbors_inner[i+jnum_inner*_nlocal]=j;}
}
}
++jnum;
if(j>=_nlocal)
jnum_border++;
else
jnum_inner++;
}
}
}
__syncthreads();
}
}
if(jnum > _maxneighbors) ((int*)_buffer)[0] = -jnum;
if(i<_nlocal)
{
_numneigh[i] = jnum;
_numneigh_inner[i] = jnum_inner;
if(i_border>=0) _numneigh_border[i_border] = jnum_border;
if(i_border>=0) _ilist_border[i_border] = i;
}
}
}
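// O(N^2) fallback build: each local atom tests every atom in the box against
// the per-type-pair squared cutoff; no binning required.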
__global__ void NeighborBuildFullNsq_Kernel()
{
int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
int* buffer = (int*) _buffer;
if(i < _nlocal)
{
X_FLOAT* my_x = _x + i;
CUDA_FLOAT x_i = *my_x; my_x += _nmax;
CUDA_FLOAT y_i = *my_x; my_x += _nmax;
CUDA_FLOAT z_i = *my_x;
int jnum = 0;
int* jlist = _firstneigh[i];
_ilist[i]=i;
int itype = _type[i];
__syncthreads();
for(int j = 0; j < _nall; ++j)
{
my_x = _x + j;
CUDA_FLOAT x_j = *my_x; my_x += _nmax;
CUDA_FLOAT y_j = *my_x; my_x += _nmax;
CUDA_FLOAT z_j = *my_x;
CUDA_FLOAT delx = x_i - x_j;
CUDA_FLOAT dely = y_i - y_j;
CUDA_FLOAT delz = z_i - z_j;
CUDA_FLOAT rsq = delx*delx + dely*dely + delz*delz;
int jtype = _type[j];
if(rsq <= _cutneighsq[itype * _cuda_ntypes + jtype] && i != j)
{
if(jnum<_maxneighbors)
jlist[jnum] = j;
++jnum;
}
__syncthreads();
}
if(jnum > _maxneighbors) buffer[0] = 0;
_numneigh[i] = jnum;
}
}


@@ -1,78 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _sigma MY_AP(coeff2)
#define _a MY_AP(coeff3)
#define _c MY_AP(coeff4)
#define _d MY_AP(coeff5)
#include "pair_born_coul_long_cuda_cu.h"
#include "pair_born_coul_long_cuda_kernel_nc.cu"
#include <time.h>
void Cuda_PairBornCoulLongCuda_Init(cuda_shared_data* sdata)
{
Cuda_Pair_Init_AllStyles(sdata, 5,true);
}
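// Launch the born/coul/long pair kernel, either one thread per atom (TpA) or
// one block per atom (BpA) depending on sdata->pair.use_block_per_atom.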
void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom)
{
static short init=0;
if(! init)
{
init = 1;
Cuda_PairBornCoulLongCuda_Init(sdata);
}
dim3 grid,threads;
int sharedperproc;
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192);
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_BORN,COUL_LONG,DATA_NONE>
<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
else
Pair_Kernel_TpA<PAIR_BORN,COUL_LONG,DATA_NONE>
<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
#undef _rhoinv
#undef _sigma
#undef _a
#undef _c
#undef _d


@@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
#ifdef CUDA_USE_BINNING
extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, int eflag, int vflag);
#else
extern "C" void Cuda_PairBornCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom);
#endif


@@ -1,34 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
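// Born-Mayer-Huggins pair evaluation:
//   E(r) = A*exp((sigma - r)/rho) - C/r^6 + D/r^8
// with _rhoinv, _sigma, _a, _c, _d supplied by the including pair style.
// Returns factor_lj * F(r)/r (i.e. forceborn * r2inv), so the caller can
// scale the displacement vector directly.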
__device__ inline F_FLOAT PairBornCuda_Eval(const F_FLOAT& rsq,const int ij_type,F_FLOAT& factor_lj,int& eflag, ENERGY_FLOAT& evdwl)
{
const F_FLOAT r2inv = F_F(1.0)/rsq;
const F_FLOAT r = _RSQRT_(r2inv);
const F_FLOAT r6inv = r2inv*r2inv*r2inv;
const F_FLOAT rexp = _EXP_((_sigma[ij_type]-r)*_rhoinv[ij_type]);
const F_FLOAT forceborn = _a[ij_type]*_rhoinv[ij_type]*r*rexp -
F_F(6.0)*_c[ij_type]*r6inv + F_F(8.0)*_d[ij_type]*r2inv*r6inv;
if(eflag) evdwl += factor_lj*(_a[ij_type]*rexp - _c[ij_type]*r6inv
+_d[ij_type]*r2inv*r6inv-_offset[ij_type]);
return factor_lj*forceborn*r2inv;
}


@@ -1,74 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _buck1 MY_AP(coeff2)
#define _buck2 MY_AP(coeff3)
#define _a MY_AP(coeff4)
#define _c MY_AP(coeff5)
#include "pair_buck_coul_cut_cuda_cu.h"
#include <time.h>
void Cuda_PairBuckCoulCutCuda_Init(cuda_shared_data* sdata)
{
Cuda_Pair_Init_AllStyles(sdata, 5,true);
}
void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom)
{
static short init=0;
if(! init)
{
init = 1;
Cuda_PairBuckCoulCutCuda_Init(sdata);
}
dim3 grid,threads;
int sharedperproc;
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192);
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_BUCK,COUL_CUT,DATA_NONE>
<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
else
Pair_Kernel_TpA<PAIR_BUCK,COUL_CUT,DATA_NONE>
<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
#undef _rhoinv
#undef _buck1
#undef _buck2
#undef _a
#undef _c


@@ -1,30 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include "cuda_shared.h"
#ifdef CUDA_USE_BINNING
extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, int eflag, int vflag);
#else
extern "C" void Cuda_PairBuckCoulCutCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom);
#endif


@@ -1,77 +0,0 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
Original Version:
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
See the README file in the top-level LAMMPS directory.
-----------------------------------------------------------------------
USER-CUDA Package and associated modifications:
https://sourceforge.net/projects/lammpscuda/
Christian Trott, christian.trott@tu-ilmenau.de
Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
Theoretical Physics II, University of Technology Ilmenau, Germany
See the README file in the USER-CUDA directory.
This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
#include <stdio.h>
#define _rhoinv MY_AP(coeff1)
#define _buck1 MY_AP(coeff2)
#define _buck2 MY_AP(coeff3)
#define _a MY_AP(coeff4)
#define _c MY_AP(coeff5)
#include "pair_buck_coul_long_cuda_cu.h"
#include <time.h>
void Cuda_PairBuckCoulLongCuda_Init(cuda_shared_data* sdata)
{
Cuda_Pair_Init_AllStyles(sdata, 5,true);
}
void Cuda_PairBuckCoulLongCuda(cuda_shared_data* sdata, cuda_shared_neighlist* sneighlist, int eflag, int vflag,int eflag_atom,int vflag_atom)
{
static short init=0;
if(! init)
{
init = 1;
Cuda_PairBuckCoulLongCuda_Init(sdata);
}
dim3 grid,threads;
int sharedperproc;
Cuda_Pair_PreKernel_AllStyles(sdata, sneighlist, eflag, vflag, grid, threads, sharedperproc,true,192);
cudaStream_t* streams = (cudaStream_t*) CudaWrapper_returnStreams();
if(sdata->pair.use_block_per_atom)
Pair_Kernel_BpA<PAIR_BUCK,COUL_LONG,DATA_NONE>
<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
else
Pair_Kernel_TpA<PAIR_BUCK,COUL_LONG,DATA_NONE>
<<<grid, threads,sharedperproc*sizeof(ENERGY_FLOAT)*threads.x,streams[1]>>> (eflag, vflag,eflag_atom,vflag_atom);
Cuda_Pair_PostKernel_AllStyles(sdata, grid, sharedperproc, eflag, vflag);
}
#undef _rhoinv
#undef _buck1
#undef _buck2
#undef _a
#undef _c

Some files were not shown because too many files have changed in this diff.