git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6261 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-05-31 21:09:14 +00:00 · 2011-05-31 21:09:14 +00:00 · 2ea09945bf
parent 3692e8c68b
commit 2ea09945bf
121 changed files with 22597 additions and 16 deletions
--- a/src/USER-CUDA/Install.sh
+++ b/src/USER-CUDA/Install.sh
@ -4,31 +4,189 @@

 if (test $1 = 1) then

-  if (test -e ../Makefile.package) then
-      sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package
-      sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package
-      sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package
-      sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package
-      sed -i '1 i include ..\/..\/lib\/cuda\/Makefile.common' ../Makefile.package
-      sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include |' ../Makefile.package
-      sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) |' ../Makefile.package
-      sed -i -e 's|^PKG_LIB =[ \t]*|&-llammpscuda -lcuda -lcudart -lrt |' ../Makefile.package
+  if (test ! -e ../Makefile.package) then
+    cp ../Makefile.package.empty ../Makefile.package
  fi

+  sed -i -e '/^include.*cuda.*$/d' ../Makefile.package
+  sed -i -e 's/[^ \t]*cuda[^ \t]* //g' ../Makefile.package
+  sed -i -e 's/[^ \t]*CUDA[^ \t]* //g' ../Makefile.package
+  sed -i -e 's/[^ \t]*lrt[^ \t]* //g' ../Makefile.package
+  sed -i '4 i include ..\/..\/lib\/cuda\/Makefile.common' ../Makefile.package
+  sed -i -e 's|^PKG_INC =[ \t]*|&-I..\/..\/lib\/cuda -DLMP_USER_CUDA |' ../Makefile.package
+  sed -i -e 's|^PKG_PATH =[ \t]*|&-L..\/..\/lib\/cuda |' ../Makefile.package
+  sed -i -e 's|^PKG_LIB =[ \t]*|&-llammpscuda |' ../Makefile.package
+  sed -i -e 's|^PKG_SYSINC =[ \t]*|&-I$(CUDA_INSTALL_PATH)\/include |' ../Makefile.package
+  sed -i -e 's|^PKG_SYSPATH =[ \t]*|&-L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(CUDA_USRLIB_CONDITIONAL) |' ../Makefile.package
+  sed -i -e 's|^PKG_SYSLIB =[ \t]*|&-lcuda -lcudart -lrt |' ../Makefile.package
+ 
+  if (test -e ../atom_vec_angle.cpp) then
+    cp atom_vec_angle_cuda.cpp ..
+    cp atom_vec_angle_cuda.h ..
+  fi
+
+  if (test -e ../atom_vec_full.cpp) then
+    cp atom_vec_full_cuda.cpp ..
+    cp atom_vec_full_cuda.h ..
+  fi
+
+  if (test -e ../fix_freeze.cpp) then
+    cp fix_freeze_cuda.cpp ..
+    cp fix_freeze_cuda.h ..
+  fi
+
+  if (test -e ../pair_born_coul_long.cpp) then
+    cp pair_born_coul_long_cuda.cpp ..
+    cp pair_born_coul_long_cuda.h ..
+  fi
+
+  if (test -e ../pair_buck_coul_long.cpp) then
+    cp pair_buck_coul_long_cuda.cpp ..
+    cp pair_buck_coul_long_cuda.h ..
+  fi
+
+  if (test -e ../pair_cg_cmm.cpp) then
+    cp pair_cg_cmm_cuda.cpp ..
+    cp pair_cg_cmm_coul_cut_cuda.cpp ..
+    cp pair_cg_cmm_coul_debye_cuda.cpp ..
+    cp pair_cg_cmm_cuda.h ..
+    cp pair_cg_cmm_coul_cut_cuda.h ..
+    cp pair_cg_cmm_coul_debye_cuda.h ..
+  fi
+
+  if (test -e ../pair_cg_cmm_coul_long.cpp) then
+    cp pair_cg_cmm_coul_long_cuda.cpp ..
+    cp pair_cg_cmm_coul_long_cuda.h ..
+  fi
+
+  if (test -e ../pppm.cpp) then
+    cp pppm_cuda.cpp ..
+    cp fft3d_cuda.cpp ..
+    cp fft3d_wrap_cuda.cpp ..
+    cp pppm_cuda.h ..
+    cp fft3d_cuda.h ..
+    cp fft3d_wrap_cuda.h ..
+    cp pair_lj_cut_coul_long_cuda.cpp ..
+    cp pair_lj_cut_coul_long_cuda.h ..
+  fi
+  
+
+  if (test -e ../pair_eam.cpp) then
+    cp pair_eam_alloy_cuda.cpp ..
+    cp pair_eam_cuda.cpp ..
+    cp pair_eam_fs_cuda.cpp ..
+    cp pair_eam_alloy_cuda.h ..
+    cp pair_eam_cuda.h ..
+    cp pair_eam_fs_cuda.h ..
+  fi
+
+  if (test -e ../pair_gran_hooke.cpp) then
+    cp pair_gran_hooke_cuda.cpp ..
+    cp pair_gran_hooke_cuda.h ..
+  fi
+
+  if (test -e ../pair_lj_charmm_coul_charmm.cpp) then
+    cp pair_lj_charmm_coul_charmm_cuda.cpp ..
+    cp pair_lj_charmm_coul_charmm_implicit_cuda.cpp ..
+    cp pair_lj_charmm_coul_charmm_cuda.h ..
+    cp pair_lj_charmm_coul_charmm_implicit_cuda.h ..
+    if (test -e ../pair_lj_charmm_coul_long.cpp) then  
+      cp pair_lj_charmm_coul_long_cuda.cpp ..
+      cp pair_lj_charmm_coul_long_cuda.h ..
+    fi
+  fi
+
+  if (test -e ../pair_lj_class2.cpp) then
+    cp pair_lj_class2_coul_cut_cuda.cpp ..
+    cp pair_lj_class2_cuda.cpp ..
+    cp pair_lj_class2_coul_cut_cuda.h ..
+    cp pair_lj_class2_cuda.h ..
+    if (test -e ../pair_lj_class2_coul_long.cpp) then
+      cp pair_lj_class2_coul_long_cuda.cpp ..
+      cp pair_lj_class2_coul_long_cuda.h ..
+    fi
+  fi 
+
+  cp atom_vec_atomic_cuda.cpp ..
+  cp atom_vec_charge_cuda.cpp ..
  cp comm_cuda.cpp ..
+  cp compute_pe_cuda.cpp ..
+  cp compute_pressure_cuda.cpp ..
+  cp compute_temp_cuda.cpp ..
+  cp compute_temp_partial_cuda.cpp ..
  cp domain_cuda.cpp ..
+  cp fix_addforce_cuda.cpp ..
+  cp fix_aveforce_cuda.cpp ..
+  cp fix_enforce2d_cuda.cpp ..
+  cp fix_gravity_cuda.cpp ..
+  cp fix_nh_cuda.cpp ..
+  cp fix_npt_cuda.cpp ..
+  cp fix_nve_cuda.cpp ..
+  cp fix_nvt_cuda.cpp ..
+  cp fix_set_force_cuda.cpp ..
+  cp fix_shake_cuda.cpp ..
+  cp fix_temp_berendsen_cuda.cpp ..
+  cp fix_temp_rescale_cuda.cpp ..
+  cp fix_temp_rescale_limit_cuda.cpp ..
+  cp fix_viscous_cuda.cpp ..
  cp modify_cuda.cpp ..
  cp neighbor_cuda.cpp ..
  cp neigh_full_cuda.cpp ..
+  cp pair_buck_coul_cut_cuda.cpp ..
+  cp pair_buck_cuda.cpp ..
+  cp pair_lj96_cut_cuda.cpp ..
+  cp pair_lj_cut_coul_cut_cuda.cpp ..
+  cp pair_lj_cut_coul_debye_cuda.cpp ..
+  cp pair_lj_cut_cuda.cpp ..
+  cp pair_lj_cut_experimental_cuda.cpp ..
+  cp pair_lj_expand_cuda.cpp ..
+  cp pair_lj_gromacs_coul_gromacs_cuda.cpp ..
+  cp pair_lj_gromacs_cuda.cpp ..
+  cp pair_lj_smooth_cuda.cpp ..
+  cp pair_morse_cuda.cpp ..
+  # pppm_cuda.cpp is intentionally not copied here: it is installed together
+  # with pppm_cuda.h only when ../pppm.cpp exists (see the pppm test above)
  cp verlet_cuda.cpp ..

  cp cuda.cpp ..
  cp cuda_neigh_list.cpp ..

+  cp atom_vec_atomic_cuda.h ..
+  cp atom_vec_charge_cuda.h ..
  cp comm_cuda.h ..
+  cp compute_pe_cuda.h ..
+  cp compute_pressure_cuda.h ..
+  cp compute_temp_cuda.h ..
+  cp compute_temp_partial_cuda.h ..
  cp domain_cuda.h ..
+  cp fix_addforce_cuda.h ..
+  cp fix_aveforce_cuda.h ..
+  cp fix_enforce2d_cuda.h ..
+  cp fix_gravity_cuda.h ..
+  cp fix_nh_cuda.h ..
+  cp fix_npt_cuda.h ..
+  cp fix_nve_cuda.h ..
+  cp fix_nvt_cuda.h ..
+  cp fix_set_force_cuda.h ..
+  cp fix_shake_cuda.h ..
+  cp fix_temp_berendsen_cuda.h ..
+  cp fix_temp_rescale_cuda.h ..
+  cp fix_temp_rescale_limit_cuda.h ..
+  cp fix_viscous_cuda.h ..
  cp modify_cuda.h ..
  cp neighbor_cuda.h ..
+  cp pair_buck_coul_cut_cuda.h ..
+  cp pair_buck_cuda.h ..
+
+  cp pair_lj96_cut_cuda.h ..
+  cp pair_lj_cut_coul_cut_cuda.h ..
+  cp pair_lj_cut_coul_debye_cuda.h ..
+  cp pair_lj_cut_cuda.h ..
+  cp pair_lj_cut_experimental_cuda.h ..
+  cp pair_lj_expand_cuda.h ..
+  cp pair_lj_gromacs_coul_gromacs_cuda.h ..
+  cp pair_lj_gromacs_cuda.h ..
+  cp pair_lj_smooth_cuda.h ..
+  cp pair_morse_cuda.h ..
  cp verlet_cuda.h ..

  cp cuda.h ..
@ -42,26 +200,136 @@ if (test $1 = 1) then
 elif (test $1 = 0) then

  if (test -e ../Makefile.package) then
-    sed -i -e '/include ..\/..\/lib\/cuda\/Makefile.common/d' ../Makefile.package
-    sed -i -e 's/-llammpscuda -lcuda -lcudart -lrt //' ../Makefile.package
-    sed -i -e 's/-I..\/..\/lib\/cuda -I$(CUDA_INSTALL_PATH)\/include //' ../Makefile.package
-    sed -i -e 's/-L..\/..\/lib\/cuda -L$(CUDA_INSTALL_PATH)\/lib64 -L$(CUDA_INSTALL_PATH)\/lib $(USRLIB_CONDITIONAL) -DLMP_USER_CUDA //' ../Makefile.package
+    sed -i -e '/^include.*cuda.*$/d' ../Makefile.package
+    sed -i -e 's/[^ \t]*cuda[^ \t]* //g' ../Makefile.package
+    sed -i -e 's/[^ \t]*CUDA[^ \t]* //g' ../Makefile.package
+    sed -i -e 's/[^ \t]*lrt[^ \t]* //g' ../Makefile.package
  fi

+  rm ../atom_vec_angle_cuda.cpp
+  rm ../atom_vec_atomic_cuda.cpp
+  rm ../atom_vec_charge_cuda.cpp
+  rm ../atom_vec_full_cuda.cpp
  rm ../comm_cuda.cpp
+  rm ../compute_pe_cuda.cpp
+  rm ../compute_pressure_cuda.cpp
+  rm ../compute_temp_cuda.cpp
+  rm ../compute_temp_partial_cuda.cpp
  rm ../domain_cuda.cpp
+  rm ../fft3d_cuda.cpp
+  rm ../fft3d_wrap_cuda.cpp
+  rm ../fix_addforce_cuda.cpp
+  rm ../fix_aveforce_cuda.cpp
+  rm ../fix_enforce2d_cuda.cpp
+  rm ../fix_freeze_cuda.cpp
+  rm ../fix_gravity_cuda.cpp
+  rm ../fix_nh_cuda.cpp
+  rm ../fix_npt_cuda.cpp
+  rm ../fix_nve_cuda.cpp
+  rm ../fix_nvt_cuda.cpp
+  rm ../fix_set_force_cuda.cpp
+  rm ../fix_shake_cuda.cpp
+  rm ../fix_temp_berendsen_cuda.cpp
+  rm ../fix_temp_rescale_cuda.cpp
+  rm ../fix_temp_rescale_limit_cuda.cpp
+  rm ../fix_viscous_cuda.cpp
  rm ../modify_cuda.cpp
  rm ../neighbor_cuda.cpp
  rm ../neigh_full_cuda.cpp
+  rm ../pair_born_coul_long_cuda.cpp
+  rm ../pair_buck_coul_cut_cuda.cpp
+  rm ../pair_buck_coul_long_cuda.cpp
+  rm ../pair_buck_cuda.cpp
+  rm ../pair_cg_cmm_coul_cut_cuda.cpp
+  rm ../pair_cg_cmm_coul_debye_cuda.cpp
+  rm ../pair_cg_cmm_coul_long_cuda.cpp
+  rm ../pair_cg_cmm_cuda.cpp
+  rm ../pair_eam_alloy_cuda.cpp
+  rm ../pair_eam_cuda.cpp
+  rm ../pair_eam_fs_cuda.cpp
+  rm ../pair_gran_hooke_cuda.cpp
+  rm ../pair_lj96_cut_cuda.cpp
+  rm ../pair_lj_charmm_coul_charmm_cuda.cpp
+  rm ../pair_lj_charmm_coul_charmm_implicit_cuda.cpp
+  rm ../pair_lj_charmm_coul_long_cuda.cpp
+  rm ../pair_lj_class2_coul_cut_cuda.cpp
+  rm ../pair_lj_class2_coul_long_cuda.cpp
+  rm ../pair_lj_class2_cuda.cpp
+  rm ../pair_lj_cut_coul_cut_cuda.cpp
+  rm ../pair_lj_cut_coul_debye_cuda.cpp
+  rm ../pair_lj_cut_coul_long_cuda.cpp
+  rm ../pair_lj_cut_cuda.cpp
+  rm ../pair_lj_cut_experimental_cuda.cpp
+  rm ../pair_lj_expand_cuda.cpp
+  rm ../pair_lj_gromacs_coul_gromacs_cuda.cpp
+  rm ../pair_lj_gromacs_cuda.cpp
+  rm ../pair_lj_smooth_cuda.cpp
+  rm ../pair_morse_cuda.cpp
+  rm ../pppm_cuda.cpp
  rm ../verlet_cuda.cpp

  rm ../cuda.cpp
  rm ../cuda_neigh_list.cpp

+  rm ../atom_vec_angle_cuda.h
+  rm ../atom_vec_atomic_cuda.h
+  rm ../atom_vec_charge_cuda.h
+  rm ../atom_vec_full_cuda.h
  rm ../comm_cuda.h
+  rm ../compute_pe_cuda.h
+  rm ../compute_pressure_cuda.h
+  rm ../compute_temp_cuda.h
+  rm ../compute_temp_partial_cuda.h
  rm ../domain_cuda.h
+  rm ../fft3d_cuda.h
+  rm ../fft3d_wrap_cuda.h
+  rm ../fix_addforce_cuda.h
+  rm ../fix_aveforce_cuda.h
+  rm ../fix_enforce2d_cuda.h
+  rm ../fix_freeze_cuda.h
+  rm ../fix_gravity_cuda.h
+  rm ../fix_nh_cuda.h
+  rm ../fix_npt_cuda.h
+  rm ../fix_nve_cuda.h
+  rm ../fix_nvt_cuda.h
+  rm ../fix_set_force_cuda.h
+  rm ../fix_shake_cuda.h
+  rm ../fix_temp_berendsen_cuda.h
+  rm ../fix_temp_rescale_cuda.h
+  rm ../fix_temp_rescale_limit_cuda.h
+  rm ../fix_viscous_cuda.h
  rm ../modify_cuda.h
  rm ../neighbor_cuda.h
+  rm ../pair_born_coul_long_cuda.h
+  rm ../pair_buck_coul_cut_cuda.h
+  rm ../pair_buck_coul_long_cuda.h
+  rm ../pair_buck_cuda.h
+  rm ../pair_cg_cmm_coul_cut_cuda.h
+  rm ../pair_cg_cmm_coul_debye_cuda.h
+  rm ../pair_cg_cmm_coul_long_cuda.h
+  rm ../pair_cg_cmm_cuda.h
+  rm ../pair_eam_alloy_cuda.h
+  rm ../pair_eam_cuda.h
+  rm ../pair_eam_fs_cuda.h
+  rm ../pair_gran_hooke_cuda.h
+  rm ../pair_lj96_cut_cuda.h
+  rm ../pair_lj_charmm_coul_charmm_cuda.h
+  rm ../pair_lj_charmm_coul_charmm_implicit_cuda.h
+  rm ../pair_lj_charmm_coul_long_cuda.h
+  rm ../pair_lj_class2_coul_cut_cuda.h
+  rm ../pair_lj_class2_coul_long_cuda.h
+  rm ../pair_lj_class2_cuda.h
+  rm ../pair_lj_cut_coul_cut_cuda.h
+  rm ../pair_lj_cut_coul_debye_cuda.h
+  rm ../pair_lj_cut_coul_long_cuda.h
+  rm ../pair_lj_cut_cuda.h
+  rm ../pair_lj_cut_experimental_cuda.h
+  rm ../pair_lj_expand_cuda.h
+  rm ../pair_lj_gromacs_coul_gromacs_cuda.h
+  rm ../pair_lj_gromacs_cuda.h
+  rm ../pair_lj_smooth_cuda.h
+  rm ../pair_morse_cuda.h
+  rm ../pppm_cuda.h
  rm ../verlet_cuda.h

  rm ../cuda.h
--- a/src/USER-CUDA/atom_vec_angle_cuda.cpp
+++ b/src/USER-CUDA/atom_vec_angle_cuda.cpp
@ -0,0 +1,476 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "atom_vec_angle_cuda.h"
+#include "comm_cuda_cu.h"
+#include "atom_vec_angle_cuda_cu.h"
+#include "atom.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+#include "universe.h"
+#include "comm.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+#define BUFFACTOR 1.5
+#define BUFEXTRA 1000
+#define NCUDAEXCHANGE 12 //nextra x y z vx vy vz tag type mask image molecule
+
+#define BUF_FLOAT double
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   build the CUDA variant on top of AtomVecAngle
+   stops with error->all() when lmp->cuda is NULL
+   all copy-list buffers start out empty; grow_copylist() allocates them
+------------------------------------------------------------------------- */
+
+AtomVecAngleCuda::AtomVecAngleCuda(LAMMPS *lmp, int narg, char **arg) :
+  AtomVecAngle(lmp, narg, arg),
+  cuda(lmp->cuda),
+  cuda_init_done(false),
+  copylist(NULL),
+  copylist2(NULL),
+  cu_copylist(NULL),
+  max_nsend(0)
+{
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // members that are not declared by this class
+  maxsend=0;
+  cudable=true;
+}
+
+/* ----------------------------------------------------------------------
+   throw away the current copy lists and allocate new ones with room for
+   new_max_nsend entries; old contents are not carried over
+------------------------------------------------------------------------- */
+
+void AtomVecAngleCuda::grow_copylist(int new_max_nsend)
+{
+  // release: device wrapper first, then the two host lists
+  delete cu_copylist;
+  delete [] copylist2;
+  if(copylist != NULL)
+    CudaWrapper_FreePinnedHostData((void*) copylist);
+
+  max_nsend = new_max_nsend;
+
+  // allocate: copylist comes from CudaWrapper_AllocPinnedHostData and is
+  // the host array handed to the cCudaData wrapper
+  const size_t nbytes = max_nsend*sizeof(int);
+  copylist = (int*) CudaWrapper_AllocPinnedHostData(nbytes,false);
+  copylist2 = new int[max_nsend];
+  cu_copylist = new cCudaData<int, int, xx > (copylist, max_nsend);
+}
+
+/* ----------------------------------------------------------------------
+   resize the comm send buffer; it must be growable from here because the
+   array shall be copied from the GPU as a whole
+   n        = requested capacity in doubles, *maxsend becomes BUFFACTOR*n
+   buf_send = address of the buffer pointer, updated in place
+   flag     = nonzero: keep the old contents, zero: contents are dropped
+   the allocation always holds *maxsend+BUFEXTRA doubles
+   uses pinned host memory when cuda->pinned is set, the LAMMPS memory
+   class otherwise
+------------------------------------------------------------------------- */
+
+void AtomVecAngleCuda::grow_send(int n,double** buf_send,int flag)
+{
+  const int old_size = *maxsend+BUFEXTRA;
+  *maxsend = static_cast<int> (BUFFACTOR * n);
+  const int new_size = *maxsend+BUFEXTRA;
+
+  if(cuda->pinned)
+  {
+    if(flag && *buf_send)
+    {
+      // copy straight into the new pinned block; never read from a NULL
+      // buffer and never copy more than the new block can hold
+      double* grown = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+      const int ncopy = old_size < new_size ? old_size : new_size;
+      memcpy(grown,*buf_send,ncopy*sizeof(double));
+      CudaWrapper_FreePinnedHostData((void*) (*buf_send));
+      *buf_send = grown;
+    }
+    else
+    {
+      // nothing to preserve (or nothing allocated yet): free, then allocate
+      if(*buf_send) CudaWrapper_FreePinnedHostData((void*) (*buf_send));
+      *buf_send = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+    }
+  }
+  else if(flag)
+  {
+    *buf_send = (double *)
+      memory->srealloc(*buf_send,new_size*sizeof(double),
+		       "comm:buf_send");
+  }
+  else
+  {
+    memory->sfree(*buf_send);
+    *buf_send = (double *) memory->smalloc(new_size*sizeof(double),
+					  "comm:buf_send");
+  }
+}
+
+/* ----------------------------------------------------------------------
+   grow the per-atom arrays through AtomVecAngle::grow(n)
+   once cuda->finished_setup is set, the call is bracketed by
+   downloadAll() before and checkResize() + uploadAll() after
+------------------------------------------------------------------------- */
+
+void AtomVecAngleCuda::grow_both(int n)
+{
+  if(cuda->finished_setup)
+  {
+    cuda->downloadAll();
+  }
+
+  AtomVecAngle::grow(n);
+
+  if(!cuda->finished_setup) return;
+
+  cuda->checkResize();
+  cuda->uploadAll();
+}
+
+/* ----------------------------------------------------------------------
+   pack forward-communication data through Cuda_CommCuda_PackComm
+   usually this should not be called since comm->communicate handles the
+   communication if only positions are exchanged
+   falls back to AtomVecAngle::pack_comm before setup has finished or
+   while running on the CPU
+------------------------------------------------------------------------- */
+
+int AtomVecAngleCuda::pack_comm(int n, int* iswap, double *buf,
+			     int pbc_flag, int *pbc)
+{
+  const bool use_host_path = !cuda->finished_setup || cuda->oncpu;
+  if(use_host_path)
+    return AtomVecAngle::pack_comm(n,iswap,buf,pbc_flag,pbc);
+
+  int m = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+
+  // a zero count, or X_FLOAT already being double-sized, needs no rescaling
+  if(m == 0 || sizeof(X_FLOAT) == sizeof(double)) return m;
+
+  // presumably converts a count of X_FLOAT values into a count of doubles
+  m = (m+1)*sizeof(X_FLOAT)/sizeof(double);
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   pack forward-communication data through Cuda_CommCuda_PackCommVel
+   usually this should not be called since comm->communicate handles the
+   communication if only positions are exchanged
+   falls back to AtomVecAngle::pack_comm_vel before setup has finished or
+   while running on the CPU
+------------------------------------------------------------------------- */
+
+int AtomVecAngleCuda::pack_comm_vel(int n, int* iswap, double *buf,
+			     int pbc_flag, int *pbc)
+{
+  const bool use_host_path = !cuda->finished_setup || cuda->oncpu;
+  if(use_host_path)
+    return AtomVecAngle::pack_comm_vel(n,iswap,buf,pbc_flag,pbc);
+
+  int m = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+
+  // a zero count, or X_FLOAT already being double-sized, needs no rescaling
+  if(m == 0 || sizeof(X_FLOAT) == sizeof(double)) return m;
+
+  // presumably converts a count of X_FLOAT values into a count of doubles
+  m = (m+1)*sizeof(X_FLOAT)/sizeof(double);
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* usually this should not be called since comm->communicate handles the
+   communication if only positions are exchanged */
+
+void AtomVecAngleCuda::unpack_comm(int n, int first, double *buf)
+{
+  if(cuda->finished_setup && !cuda->oncpu)
+  {
+    Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
+  }
+  else
+  {
+    // setup not finished, or running on the CPU: use the host implementation
+    AtomVecAngle::unpack_comm(n,first,buf);
+  }
+}
+
+/* usually this should not be called since comm->communicate handles the
+   communication if only positions are exchanged */
+
+void AtomVecAngleCuda::unpack_comm_vel(int n, int first, double *buf)
+{
+  if(cuda->finished_setup && !cuda->oncpu)
+  {
+    Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf);
+  }
+  else
+  {
+    // setup not finished, or running on the CPU: use the host implementation
+    AtomVecAngle::unpack_comm_vel(n,first,buf);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* usually this should not be called since comm->communicate handles the
+   communication if only forces are exchanged
+   copies f[first..first+n-1][0..2] into buf and returns the number of
+   values written; cu_f is downloaded before and uploaded after the copy */
+
+int AtomVecAngleCuda::pack_reverse(int n, int first, double *buf)
+{
+  if(!cuda->finished_setup || cuda->oncpu)
+    return AtomVecAngle::pack_reverse(n,first,buf);
+
+  cuda->cu_f->download();
+
+  int nvalues = 0;
+  const int end = first + n;
+  for (int iatom = first; iatom < end; ++iatom)
+    for (int d = 0; d < 3; ++d)
+      buf[nvalues++] = f[iatom][d];
+
+  cuda->cu_f->upload();
+  return nvalues;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* usually this should not be called since comm->communicate handles the
+   communication if only forces are exchanged
+   adds three consecutive buf values to f[list[i]][0..2] for each of the
+   n list entries; cu_f is downloaded before and uploaded after the update */
+
+void AtomVecAngleCuda::unpack_reverse(int n, int *list, double *buf)
+{
+  if(!cuda->finished_setup || cuda->oncpu)
+  {
+    AtomVecAngle::unpack_reverse(n,list,buf);
+    return;
+  }
+
+  cuda->cu_f->download();
+
+  int nread = 0;
+  for (int ilist = 0; ilist < n; ++ilist) {
+    const int iatom = list[ilist];
+    for (int d = 0; d < 3; ++d)
+      f[iatom][d] += buf[nread++];
+  }
+
+  cuda->cu_f->upload();
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* pack border atoms through Cuda_AtomVecAngleCuda_PackBorder and return
+   its result; the host implementation is used before setup has finished
+   or while running on the CPU */
+
+int AtomVecAngleCuda::pack_border(int n, int *iswap, double *buf,
+			       int pbc_flag, int *pbc)
+{
+  const bool use_host_path = !cuda->finished_setup || cuda->oncpu;
+  if(use_host_path)
+    return AtomVecAngle::pack_border(n,iswap,buf,pbc_flag,pbc);
+
+  return Cuda_AtomVecAngleCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+}
+
+/* pack border atoms through Cuda_AtomVecAngleCuda_PackBorderVel and return
+   its result; the host implementation is used before setup has finished
+   or while running on the CPU */
+
+int AtomVecAngleCuda::pack_border_vel(int n, int *iswap, double *buf,
+			       int pbc_flag, int *pbc)
+{
+  const bool use_host_path = !cuda->finished_setup || cuda->oncpu;
+  if(use_host_path)
+    return AtomVecAngle::pack_border_vel(n,iswap,buf,pbc_flag,pbc);
+
+  return Cuda_AtomVecAngleCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* unpack n border atoms starting at index first through
+   Cuda_AtomVecAngleCuda_UnpackBorder; a nonzero result is only reported
+   with printf, execution continues */
+
+void AtomVecAngleCuda::unpack_border(int n, int first, double *buf)
+{
+  if(!cuda->finished_setup || cuda->oncpu)
+  {
+    AtomVecAngle::unpack_border(n,first,buf);
+    return;
+  }
+
+  // ensure there is enough space on device to unpack data
+  while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax)
+    grow_both(0);
+
+  const int failed=Cuda_AtomVecAngleCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf);
+  if(failed)
+    printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+}
+
+/* unpack n border atoms starting at index first through
+   Cuda_AtomVecAngleCuda_UnpackBorderVel; a nonzero result is only reported
+   with printf, execution continues */
+
+void AtomVecAngleCuda::unpack_border_vel(int n, int first, double *buf)
+{
+  if(!cuda->finished_setup || cuda->oncpu)
+  {
+    AtomVecAngle::unpack_border_vel(n,first,buf);
+    return;
+  }
+
+  // ensure there is enough space on device to unpack data
+  while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax)
+    grow_both(0);
+
+  const int failed=Cuda_AtomVecAngleCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf);
+  if(failed)
+    printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+}
+
+/* ----------------------------------------------------------------------
+   pack data for atom I for sending to another proc
+   xyz must be 1st 3 values, so comm::exchange() can test on them 
+------------------------------------------------------------------------- */
+
+
+/* CUDA path of pack_exchange.
+   When cuda->oncpu is set this simply defers to AtomVecAngle::pack_exchange.
+   Otherwise buf is NOT treated as a plain double array: it is reinterpreted
+   as double** (the address of the send-buffer pointer), so that the buffer
+   can be reallocated from here via grow_send().
+   On return slot 0 of the buffer holds nsend_atoms and slots
+   1..nsend_atoms hold, per leaving atom, the number of extra values
+   appended for it on the host (bonds, angles, specials, fix data).
+   atom->nlocal is reduced by nsend_atoms.
+   Returns 0 when m==1 after packing, otherwise m. */
+int AtomVecAngleCuda::pack_exchange(int dim, double *buf)
+{
+  if(cuda->oncpu)
+  	return AtomVecAngle::pack_exchange(dim,buf);
+
+  // (re)run the device-side init on first use and whenever the box changes
+  if(not cuda_init_done||domain->box_change)
+  {
+  	Cuda_AtomVecAngleCuda_Init(&cuda->shared_data);
+  	cuda_init_done=true;
+  }
+  // buf actually carries the address of the send-buffer pointer
+  double** buf_pointer=(double**) buf;
+  // make sure a buffer exists and *maxsend is at least atom->nghost
+  if(*maxsend<atom->nghost || *buf_pointer==NULL)
+  {
+  	grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0);
+  	*maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend;
+  }
+  
+  // first call: allocate the copy lists with an initial size of 200
+  if(max_nsend==0) grow_copylist(200);
+
+  // presumably fills slots 1..nsend_atoms with the local indices of the
+  // leaving atoms - inferred from how those slots are read below
+  int nsend_atoms = Cuda_AtomVecAngleCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  
+  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
+  // buffer too small for NCUDAEXCHANGE values per atom: enlarge it without
+  // keeping its contents and build the list again
+  if(nsend_atoms*NCUDAEXCHANGE>*maxsend) 
+  {
+  	grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
+  	Cuda_AtomVecAngleCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  }
+
+  // nlocal is the local atom count after the leaving atoms are removed
+  int nlocal=atom->nlocal-nsend_atoms;
+  
+  // copylist2[k] refers to index nlocal+k; it is set to -1 when that atom
+  // is itself leaving and therefore cannot be used to fill a hole
+  for(int i=0;i<nsend_atoms;i++) copylist2[i]=1;
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i>=nlocal) copylist2[i-nlocal]=-1;
+  }
+  
+  // for every leaving atom with index below nlocal, pick the next index
+  // >= nlocal that is not leaving as the atom to move into its place;
+  // copylist entries of leaving atoms with index >= nlocal are not written
+  int actpos=0;
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i<nlocal) 
+  	{
+  	  while(copylist2[actpos]==-1) actpos++;
+    	  copylist[j-1]=nlocal+actpos;
+  	  actpos++;
+  	}
+  }
+  cu_copylist->upload();
+  
+  cuda->shared_data.atom.nlocal=nlocal;
+  
+  // m is used below as the next free slot of the buffer
+  int m = Cuda_AtomVecAngleCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data());
+  
+  // time the host-side part of the packing
+  timespec time1,time2;
+  clock_gettime(CLOCK_REALTIME,&time1);
+ 
+  // append bond, angle, special and fix data of every leaving atom on the host
+  double* buf_p=*buf_pointer;
+  for(int j=0;j<nsend_atoms;j++)
+  {
+    // slot j+1 still holds the atom index here; it is overwritten with
+    // nextra at the end of this iteration
+    int i=static_cast <int> (buf_p[j+1]);
+    int nextra=0;
+    int k;
+    buf_p[m++] = num_bond[i];
+    for (k = 0; k < num_bond[i]; k++) {
+      buf_p[m++] = bond_type[i][k];
+      buf_p[m++] = bond_atom[i][k];
+    }
+    nextra+=2*num_bond[i]+1;
+    // NOTE(review): the capacity check runs after the values were written;
+    // this appears to rely on the BUFEXTRA slack added by grow_send - confirm
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+
+    buf_p[m++] = num_angle[i];
+    for (k = 0; k < num_angle[i]; k++) {
+      buf_p[m++] = angle_type[i][k];
+      buf_p[m++] = angle_atom1[i][k];
+      buf_p[m++] = angle_atom2[i][k];
+      buf_p[m++] = angle_atom3[i][k];
+    }
+    nextra+=4*num_angle[i]+1;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+
+    buf_p[m++] = nspecial[i][0];
+    buf_p[m++] = nspecial[i][1];
+    buf_p[m++] = nspecial[i][2];
+    for (k = 0; k < nspecial[i][2]; k++) buf_p[m++] = special[i][k];
+    nextra+=nspecial[i][2]+3;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+  
+    // let every fix listed in atom->extra_grow append its per-atom data
+    if (atom->nextra_grow)
+      for (int iextra = 0; iextra < atom->nextra_grow; iextra++) 
+      {
+        int dm= modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf_p[m]);
+        m+=dm;
+  		nextra+=dm;
+        // mirror the hole filling for the fix's own arrays
+        if(i<nlocal)modify->fix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i);
+    	if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+      }
+
+    // fill the hole at i on the host with the atom selected in copylist
+    if(i<nlocal)AtomVecAngle::copy(copylist[j],i,1);  
+    // replace the atom index in slot j+1 by the number of extra values
+    (*buf_pointer)[j+1] = nextra;
+  }
+	  
+	  clock_gettime(CLOCK_REALTIME,&time2);
+	  // elapsed wall time in seconds, accumulated over all calls
+	  cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+  (*buf_pointer)[0] = nsend_atoms;
+  atom->nlocal-=nsend_atoms;
+  cuda->shared_data.atom.update_nlocal=2;
+ //printf("End Pack Exchange\n");
+  if(m==1) return 0;
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* CUDA path of unpack_exchange.
+   When cuda->oncpu is set this defers to AtomVecAngle::unpack_exchange.
+   Otherwise buf is read as one or two consecutive messages (two when
+   comm->procgrid[dim] > 2, with dim = cuda->shared_data.exchange_dim).
+   Each message starts with the atom count in slot 0, followed by the
+   per-atom extra counts in slots 1..nsend_atoms, then NCUDAEXCHANGE-1 more
+   values per atom handled by Cuda_AtomVecAngleCuda_UnpackExchange, then the
+   host-side bond/angle/special/fix data.
+   Updates atom->nlocal and the device copy of nlocal.
+   Returns the total number of buffer values consumed. */
+int AtomVecAngleCuda::unpack_exchange(double *buf)
+{
+// printf("Begin UnPack Exchange\n");
+  if(cuda->oncpu)
+    return AtomVecAngle::unpack_exchange(buf);
+
+  const int dim=cuda->shared_data.exchange_dim;
+  if(domain->box_change)
+    Cuda_AtomVecAngleCuda_Init(&cuda->shared_data);
+
+  // same guard as in pack_exchange: cu_copylist is dereferenced below and
+  // is still NULL until grow_copylist() has run once
+  if(max_nsend==0) grow_copylist(200);
+
+  int mfirst=0;
+  for(int pi=0;pi<(comm->procgrid[dim]>2?2:1);pi++)
+  {
+    int nlocal = atom->nlocal;
+    int nsend_atoms=static_cast<int> (buf[0]);
+    if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
+
+    //ensure there is enough space on device to unpack data
+    if (nlocal+nsend_atoms+atom->nghost>=atom->nmax)
+      grow_both(nlocal+nsend_atoms*2+atom->nghost);
+
+    int naccept = Cuda_AtomVecAngleCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data());
+    // after the download copylist[j] is the local index assigned to
+    // incoming atom j; values <= -1 mark atoms that are skipped below
+    cu_copylist->download();
+
+    // the host-side data starts behind the fixed-size part of the message
+    int m = nsend_atoms*NCUDAEXCHANGE + 1;
+    nlocal+=naccept;
+
+    // time the host-side part of the unpacking
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+
+    for(int j=0;j<nsend_atoms;j++)
+    {
+      if(copylist[j]>-1)
+      {
+        int k;
+        int i=copylist[j];
+        num_bond[i] = static_cast<int> (buf[m++]);
+        for (k = 0; k < num_bond[i]; k++) {
+          bond_type[i][k] = static_cast<int> (buf[m++]);
+          bond_atom[i][k] = static_cast<int> (buf[m++]);
+        }
+
+        num_angle[i] = static_cast<int> (buf[m++]);
+        for (k = 0; k < num_angle[i]; k++) {
+          angle_type[i][k] = static_cast<int> (buf[m++]);
+          angle_atom1[i][k] = static_cast<int> (buf[m++]);
+          angle_atom2[i][k] = static_cast<int> (buf[m++]);
+          angle_atom3[i][k] = static_cast<int> (buf[m++]);
+        }
+
+        nspecial[i][0] = static_cast<int> (buf[m++]);
+        nspecial[i][1] = static_cast<int> (buf[m++]);
+        nspecial[i][2] = static_cast<int> (buf[m++]);
+        for (k = 0; k < nspecial[i][2]; k++)
+          special[i][k] = static_cast<int> (buf[m++]);
+
+        if (atom->nextra_grow)
+          for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+            m += modify->fix[atom->extra_grow[iextra]]->
+              unpack_exchange(i,&buf[m]);
+      }
+      else
+      {
+        // skipped atom: jump over its extra values, whose count the
+        // sender stored in slot j+1
+        m+=static_cast <int> (buf[j+1]);
+      }
+    }
+
+    clock_gettime(CLOCK_REALTIME,&time2);
+    // NOTE(review): unpack time is accumulated into the *pack* counter,
+    // exactly as before - confirm this is intended
+    cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+    cuda->shared_data.atom.nlocal=nlocal;
+    cuda->shared_data.atom.update_nlocal=2;
+    atom->nlocal=nlocal;
+
+    // advance to the next message, if any
+    mfirst+=m;
+    buf=&buf[m];
+  }
+  return mfirst;
+}
+
+
+
--- a/src/USER-CUDA/atom_vec_angle_cuda.h
+++ b/src/USER-CUDA/atom_vec_angle_cuda.h
@ -0,0 +1,69 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(angle/cuda,AtomVecAngleCuda)
+
+#else
+
+#ifndef LMP_ATOM_VEC_ANGLE_CUDA_H
+#define LMP_ATOM_VEC_ANGLE_CUDA_H
+
+#include "atom_vec_angle.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// CUDA variant of the "angle" atom style, registered as angle/cuda.
+// Every pack/unpack method has a device path and falls back to the
+// AtomVecAngle implementation when the Cuda object reports CPU execution.
+class AtomVecAngleCuda : public AtomVecAngle {
+ public:
+  AtomVecAngleCuda(class LAMMPS *, int, char **);
+  // NOTE(review): empty destructor - the buffers allocated by
+  // grow_copylist() (copylist, copylist2, cu_copylist) are not released here
+  virtual ~AtomVecAngleCuda() {}
+  // reallocate the copy lists for n entries; old contents are dropped
+  void grow_copylist(int n);
+  // resize *buf_send; flag != 0 keeps the old contents
+  void grow_send(int n,double** buf_send,int flag);
+  // grow the host arrays via AtomVecAngle::grow(n) and resync the device
+  void grow_both(int n);
+  int pack_comm(int, int *, double *, int, int *);
+  int pack_comm_vel(int, int *, double *, int, int *);
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  int pack_reverse(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  // on the device path the double* argument is reinterpreted as double**
+  // (address of the send-buffer pointer)
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+ private:
+  class Cuda *cuda;                      // taken from lmp->cuda in the constructor
+  bool cuda_init_done;                   // set once pack_exchange has run the device init
+  int* copylist;                         // pinned host array wrapped by cu_copylist
+  int* copylist2;                        // host-only scratch list used by pack_exchange
+  cCudaData<int, int, xx >* cu_copylist; // wrapper used to upload/download copylist
+  int max_nsend;                         // number of entries the copy lists can hold
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/atom_vec_atomic_cuda.cpp
+++ b/src/USER-CUDA/atom_vec_atomic_cuda.cpp
@ -0,0 +1,407 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "atom_vec_atomic_cuda.h"
+#include "comm_cuda_cu.h"
+#include "atom_vec_atomic_cuda_cu.h"
+#include "atom.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+#include "comm.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+#define BUFFACTOR 1.5
+#define BUFEXTRA 1000
+#define NCUDAEXCHANGE 11 //nextra x y z vx vy vz tag type mask image
+
+
+#define BUF_FLOAT double
+/* ---------------------------------------------------------------------- */
+
+AtomVecAtomicCuda::AtomVecAtomicCuda(LAMMPS *lmp, int narg, char **arg) :
+  AtomVecAtomic(lmp, narg, arg)
+{
+  // A /cuda atom style is only usable when the LAMMPS instance carries a
+  // Cuda object; otherwise the problem is reported through error->all().
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // The copy lists are created lazily by grow_copylist().
+  copylist = NULL;
+  copylist2 = NULL;
+  cu_copylist = NULL;
+  max_nsend = 0;
+
+  // Device-side initialisation is deferred until the first pack_exchange().
+  cuda_init_done = false;
+  cudable = true;
+  maxsend = 0;
+}
+
+// Discard the current copy lists and allocate fresh ones holding
+// new_max_nsend entries each. Previous contents are not preserved.
+void AtomVecAtomicCuda::grow_copylist(int new_max_nsend)
+{
+  max_nsend = new_max_nsend;
+
+  // Release the old cCudaData wrapper, scratch list and pinned host list.
+  delete cu_copylist;
+  delete [] copylist2;
+  if (copylist != NULL) {
+    CudaWrapper_FreePinnedHostData((void*) copylist);
+  }
+
+  // copylist lives in pinned host memory and is wrapped by cu_copylist;
+  // copylist2 is an ordinary heap array of the same length.
+  copylist = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false);
+  copylist2 = new int[max_nsend];
+  cu_copylist = new cCudaData<int, int, xx > (copylist, max_nsend);
+}
+
+/* ----------------------------------------------------------------------
+   (re)allocate the send buffer *buf_send so that it holds
+   BUFFACTOR*n + BUFEXTRA doubles, and store BUFFACTOR*n in *maxsend
+   n        = requested number of doubles (before BUFFACTOR is applied)
+   buf_send = address of the buffer pointer, updated in place
+   flag     = nonzero: keep the existing contents, zero: contents are dropped
+   the buffer is pinned host memory when cuda->pinned is set, otherwise it
+   is managed through the LAMMPS memory class
+   assumes the old buffer holds (old *maxsend)+BUFEXTRA doubles
+------------------------------------------------------------------------- */
+
+void AtomVecAtomicCuda::grow_send(int n,double** buf_send,int flag)
+{
+  const int old_size = *maxsend+BUFEXTRA;
+  *maxsend = static_cast<int> (BUFFACTOR * n);
+  const int new_size = *maxsend+BUFEXTRA;
+
+  if(cuda->pinned)
+  {
+    double* old_buf = *buf_send;
+    if(flag && old_buf)
+    {
+      // copy straight from the old into the new buffer; never copy more than
+      // the smaller of the two sizes and never read from a NULL buffer
+      double* new_buf = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+      const int ncopy = old_size < new_size ? old_size : new_size;
+      memcpy((void*) new_buf,(void*) old_buf,ncopy*sizeof(double));
+      CudaWrapper_FreePinnedHostData((void*) old_buf);
+      *buf_send = new_buf;
+    }
+    else
+    {
+      // nothing to preserve: free first, then allocate
+      if(old_buf) CudaWrapper_FreePinnedHostData((void*) old_buf);
+      *buf_send = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+    }
+  }
+  else if(flag)
+  {
+    *buf_send = (double *)
+      memory->srealloc(*buf_send,new_size*sizeof(double),
+                       "comm:buf_send");
+  }
+  else
+  {
+    memory->sfree(*buf_send);
+    *buf_send = (double *) memory->smalloc(new_size*sizeof(double),
+                                           "comm:buf_send");
+  }
+}
+
+// Grow the per-atom arrays through AtomVecAtomic::grow(n). Once the Cuda
+// setup has finished, the call is bracketed by a download of all data
+// beforehand and a resize check plus upload afterwards.
+void AtomVecAtomicCuda::grow_both(int n)
+{
+  if (cuda->finished_setup) {
+    cuda->downloadAll();
+  }
+
+  AtomVecAtomic::grow(n);
+
+  if (cuda->finished_setup) {
+    cuda->checkResize();
+    cuda->uploadAll();
+  }
+}
+
+// Pack forward-communication data. Delegates to AtomVecAtomic::pack_comm
+// until the Cuda setup has finished or while running on the CPU; otherwise
+// Cuda_CommCuda_PackComm is called with *iswap passed by value.
+// Returns the element count reported by the packer, rescaled when X_FLOAT
+// and double differ in size.
+int AtomVecAtomicCuda::pack_comm(int n, int* iswap, double *buf,
+                                 int pbc_flag, int *pbc)
+{
+  const bool use_host = !cuda->finished_setup || cuda->oncpu;
+  if (use_host) {
+    return AtomVecAtomic::pack_comm(n,iswap,buf,pbc_flag,pbc);
+  }
+
+  int count = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+  if (count && (sizeof(X_FLOAT)!=sizeof(double))) {
+    count = (count+1)*sizeof(X_FLOAT)/sizeof(double);
+  }
+  return count;
+}
+
+// Velocity-carrying variant of pack_comm: same host fallback, with
+// Cuda_CommCuda_PackCommVel doing the work otherwise. The returned count is
+// rescaled when X_FLOAT and double differ in size.
+int AtomVecAtomicCuda::pack_comm_vel(int n, int* iswap, double *buf,
+                                     int pbc_flag, int *pbc)
+{
+  const bool use_host = !cuda->finished_setup || cuda->oncpu;
+  if (use_host) {
+    return AtomVecAtomic::pack_comm_vel(n,iswap,buf,pbc_flag,pbc);
+  }
+
+  int count = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+  if (count && (sizeof(X_FLOAT)!=sizeof(double))) {
+    count = (count+1)*sizeof(X_FLOAT)/sizeof(double);
+  }
+  return count;
+}
+/* ---------------------------------------------------------------------- */
+
+// Unpack forward-communication data for n atoms starting at index first.
+// Uses the AtomVecAtomic implementation until the Cuda setup has finished or
+// while running on the CPU, Cuda_CommCuda_UnpackComm otherwise.
+void AtomVecAtomicCuda::unpack_comm(int n, int first, double *buf)
+{
+  if (cuda->finished_setup && !cuda->oncpu) {
+    Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
+  } else {
+    AtomVecAtomic::unpack_comm(n,first,buf);
+  }
+}
+
+// Velocity-carrying variant of unpack_comm: AtomVecAtomic::unpack_comm_vel
+// on the host path, Cuda_CommCuda_UnpackCommVel otherwise.
+void AtomVecAtomicCuda::unpack_comm_vel(int n, int first, double *buf)
+{
+  if (cuda->finished_setup && !cuda->oncpu) {
+    Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf);
+  } else {
+    AtomVecAtomic::unpack_comm_vel(n,first,buf);
+  }
+}
+/* ---------------------------------------------------------------------- */
+
+// Pack the forces of atoms [first, first+n) into buf, three doubles per
+// atom, and return the number of doubles written.
+// Uses AtomVecAtomic::pack_reverse until the Cuda setup has finished or
+// while running on the CPU. Otherwise the host force array is refreshed
+// through cuda->cu_f->download() before it is read, and uploaded again
+// afterwards, matching AtomVecChargeCuda::pack_reverse; without the
+// download the host copy of f may be stale.
+int AtomVecAtomicCuda::pack_reverse(int n, int first, double *buf)
+{
+  if(not cuda->finished_setup || cuda->oncpu)
+    return AtomVecAtomic::pack_reverse(n,first,buf);
+
+  int i,m,last;
+
+  cuda->cu_f->download();
+  m = 0;
+  last = first + n;
+  for (i = first; i < last; i++) {
+    buf[m++] = f[i][0];
+    buf[m++] = f[i][1];
+    buf[m++] = f[i][2];
+  }
+  cuda->cu_f->upload();
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Add the forces stored in buf (three doubles per entry) onto the atoms
+// listed in list[0..n-1].
+// Uses AtomVecAtomic::unpack_reverse until the Cuda setup has finished or
+// while running on the CPU. Otherwise the host force array is downloaded
+// before accumulating and uploaded afterwards, matching
+// AtomVecChargeCuda::unpack_reverse; without this the sums would be applied
+// to a possibly stale host copy and never reach the device.
+void AtomVecAtomicCuda::unpack_reverse(int n, int *list, double *buf)
+{
+  if(not cuda->finished_setup || cuda->oncpu)
+  {
+    AtomVecAtomic::unpack_reverse(n,list,buf);
+    return;
+  }
+
+  int i,j,m;
+
+  m = 0;
+  cuda->cu_f->download();
+  for (i = 0; i < n; i++) {
+    j = list[i];
+    f[j][0] += buf[m++];
+    f[j][1] += buf[m++];
+    f[j][2] += buf[m++];
+  }
+  cuda->cu_f->upload();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Pack border-atom data. Delegates to AtomVecAtomic::pack_border until the
+// Cuda setup has finished or while running on the CPU; otherwise returns
+// the value of Cuda_AtomVecAtomicCuda_PackBorder, called with *iswap passed
+// by value.
+int AtomVecAtomicCuda::pack_border(int n, int *iswap, double *buf,
+                                   int pbc_flag, int *pbc)
+{
+  const bool use_host = !cuda->finished_setup || cuda->oncpu;
+  if (use_host) {
+    return AtomVecAtomic::pack_border(n,iswap,buf,pbc_flag,pbc);
+  }
+
+  return Cuda_AtomVecAtomicCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+}
+
+// Velocity-carrying variant of pack_border: AtomVecAtomic::pack_border_vel
+// on the host path, Cuda_AtomVecAtomicCuda_PackBorderVel otherwise.
+int AtomVecAtomicCuda::pack_border_vel(int n, int *iswap, double *buf,
+                                       int pbc_flag, int *pbc)
+{
+  const bool use_host = !cuda->finished_setup || cuda->oncpu;
+  if (use_host) {
+    return AtomVecAtomic::pack_border_vel(n,iswap,buf,pbc_flag,pbc);
+  }
+
+  return Cuda_AtomVecAtomicCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+}
+/* ---------------------------------------------------------------------- */
+
+// Unpack n border atoms starting at index first.
+// Host path: AtomVecAtomic::unpack_border. Otherwise the arrays are grown
+// via grow_both(0) for as long as nghost+nlocal+n reaches the nmax recorded
+// in cuda->shared_data, then Cuda_AtomVecAtomicCuda_UnpackBorder is called;
+// a nonzero result is only reported on stdout.
+void AtomVecAtomicCuda::unpack_border(int n, int first, double *buf)
+{
+  if (!cuda->finished_setup || cuda->oncpu) {
+    AtomVecAtomic::unpack_border(n,first,buf);
+    return;
+  }
+
+  while (atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) {
+    grow_both(0);
+  }
+
+  const int failed = Cuda_AtomVecAtomicCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf);
+  if (failed) {
+    printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+  }
+}
+
+// Velocity-carrying variant of unpack_border: same host fallback and growth
+// loop, with Cuda_AtomVecAtomicCuda_UnpackBorderVel doing the unpacking.
+// A nonzero result is only reported on stdout.
+void AtomVecAtomicCuda::unpack_border_vel(int n, int first, double *buf)
+{
+  if (!cuda->finished_setup || cuda->oncpu) {
+    AtomVecAtomic::unpack_border_vel(n,first,buf);
+    return;
+  }
+
+  while (atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) {
+    grow_both(0);
+  }
+
+  const int failed = Cuda_AtomVecAtomicCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf);
+  if (failed) {
+    printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+  }
+}
+/* ----------------------------------------------------------------------
+   pack data for atom I for sending to another proc
+   xyz must be 1st 3 values, so comm::exchange() can test on them 
+------------------------------------------------------------------------- */
+
+
+// Pack all atoms that leave this processor along dimension dim.
+// On the CPU path this simply forwards to AtomVecAtomic::pack_exchange.
+// On the CUDA path buf is NOT a data buffer: it is reinterpreted as the
+// address of the send-buffer pointer so the buffer can be reallocated here.
+// Resulting buffer layout, as written by this function: slot 0 holds the
+// number of sent atoms, slots 1..nsend_atoms hold the per-atom count of
+// extra fix values, followed by whatever Cuda_AtomVecAtomicCuda_PackExchange
+// and the fixes wrote.
+// Returns the number of doubles used, or 0 when only the counter slot is used.
+int AtomVecAtomicCuda::pack_exchange(int dim, double *buf)
+{
+  if(cuda->oncpu)
+  	return AtomVecAtomic::pack_exchange(dim,buf);
+
+  // (re)run the device-side init on first use and whenever the box changes
+  if(not cuda_init_done||domain->box_change)
+  {
+  	Cuda_AtomVecAtomicCuda_Init(&cuda->shared_data);
+  	cuda_init_done=true;
+  }
+  double** buf_pointer=(double**) buf;
+  // make sure a buffer exists and *maxsend is at least nghost
+  if(*maxsend<atom->nghost || *buf_pointer==NULL)
+  {
+  	grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0);
+  	// NOTE(review): grow_send already stored BUFFACTOR*n in *maxsend, so
+  	// this assignment looks like it leaves the value unchanged -- confirm
+  	*maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend;
+  }
+  
+  if(max_nsend==0) grow_copylist(200);
+  
+  // returns the number of atoms to send; slots 1..nsend_atoms of the buffer
+  // are read back below as atom indices
+  int nsend_atoms = Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  
+  if(nsend_atoms>max_nsend) {grow_copylist(nsend_atoms+100);}
+  // buffer too small for the full per-atom data: grow it (contents dropped)
+  // and build the list again
+  if(nsend_atoms*NCUDAEXCHANGE>*maxsend) 
+  {
+  	grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
+  	Cuda_AtomVecAtomicCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  }
+  
+  // local atom count after the exchange
+  int nlocal=atom->nlocal-nsend_atoms;
+  
+  // copylist2[k] describes slot nlocal+k at the end of the local range:
+  // -1 if that atom is itself being sent, 1 otherwise
+  for(int i=0;i<nsend_atoms;i++) copylist2[i]=1;
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i>=nlocal) copylist2[i-nlocal]=-1;
+  }
+  
+  // for every sent atom below the new nlocal, record in copylist the next
+  // tail slot that is not marked -1 (presumably the atom that will be moved
+  // into the vacated slot -- confirm against the device code)
+  int actpos=0;
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i<nlocal) 
+  	{
+  	  while(copylist2[actpos]==-1) actpos++;
+    	  copylist[j-1]=nlocal+actpos;
+  	  actpos++;
+  	}
+  }
+  cu_copylist->upload();
+  
+  cuda->shared_data.atom.nlocal=nlocal;
+  
+  // m = number of doubles in the buffer after the per-atom data was packed
+  int m = Cuda_AtomVecAtomicCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data());
+  // let every fix with extra per-atom data append its values after position m
+  if (atom->nextra_grow)
+  for(int j=0;j<nsend_atoms;j++)
+  {
+      int i=static_cast <int> ((*buf_pointer)[j+1]);
+      int nextra=0;
+      for (int iextra = 0; iextra < atom->nextra_grow; iextra++) {
+      	
+        int dm = modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&((*buf_pointer)[m]));
+        m+=dm;
+  		nextra+=dm;
+        // keep the fix arrays in step with the slot recorded in copylist
+        if(i<nlocal)modify->fix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i);
+        // flag=1: grow the buffer while keeping what has been packed so far
+        if(m>*maxsend)  grow_send(m,buf_pointer,1);
+      }
+      // the index slot is reused to carry the number of extra values
+      (*buf_pointer)[j+1] = nextra;
+      
+  }
+
+  (*buf_pointer)[0] = nsend_atoms;
+  atom->nlocal-=nsend_atoms;
+  cuda->shared_data.atom.update_nlocal=2;
+
+  if(m==1) return 0;//m is at least 1 in cuda since buf[0] contains number of atoms
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Unpack atoms received in an exchange and return the number of doubles
+// consumed from buf.
+// On the CPU path this forwards to AtomVecAtomic::unpack_exchange.
+// On the CUDA path buf holds one message, or two back-to-back messages when
+// the processor grid has more than two processors along the exchange
+// dimension stored in cuda->shared_data. Each message starts with the
+// number of atoms in slot 0; slot j+1 holds the number of extra fix values
+// of atom j, which is used to skip over atoms that were not accepted.
+int AtomVecAtomicCuda::unpack_exchange(double *buf)
+{
+  if(cuda->oncpu)
+    return AtomVecAtomic::unpack_exchange(buf);
+
+  const int dim=cuda->shared_data.exchange_dim;
+  if(domain->box_change)
+    Cuda_AtomVecAtomicCuda_Init(&cuda->shared_data);
+
+  // number of messages stored in buf
+  const int npasses = comm->procgrid[dim]>2 ? 2 : 1;
+
+  int mfirst=0;
+  for(int pi=0;pi<npasses;pi++)
+  {
+    int nlocal = atom->nlocal;
+
+    int nsend_atoms=static_cast<int> (buf[0]);
+    if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
+
+    // make room for the incoming atoms before unpacking
+    if (nlocal+nsend_atoms+atom->nghost>=atom->nmax) grow_both(nlocal+nsend_atoms*2+atom->nghost);
+    int naccept = Cuda_AtomVecAtomicCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data());
+    // copylist[j] > -1 is the local index an accepted atom j was stored at
+    cu_copylist->download();
+
+    // position of the first extra fix value in this message
+    int m = nsend_atoms*NCUDAEXCHANGE + 1;
+    nlocal+=naccept;
+    if (atom->nextra_grow)
+    {
+      for(int j=0;j<nsend_atoms;j++)
+      {
+        if(copylist[j]>-1)
+        {
+          for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+            m += modify->fix[atom->extra_grow[iextra]]->
+              unpack_exchange(copylist[j],&buf[m]);
+        }
+        else
+        {
+          // atom not accepted: skip its extra values
+          m+=static_cast <int> (buf[j+1]);
+        }
+      }
+    }
+    cuda->shared_data.atom.nlocal=nlocal;
+    cuda->shared_data.atom.update_nlocal=2;
+    atom->nlocal=nlocal;
+    mfirst+=m;
+    // continue with the next message, if any
+    buf=&buf[m];
+  }
+  return mfirst;
+}
+
+
+
--- a/src/USER-CUDA/atom_vec_atomic_cuda.h
+++ b/src/USER-CUDA/atom_vec_atomic_cuda.h
@ -0,0 +1,81 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+#ifdef ATOM_CLASS
+
+AtomStyle(atomic/cuda,AtomVecAtomicCuda)
+
+#else
+
+#ifndef LMP_ATOM_VEC_ATOMIC_CUDA_H
+#define LMP_ATOM_VEC_ATOMIC_CUDA_H
+
+#include "atom_vec_atomic.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// CUDA-enabled counterpart of AtomVecAtomic, registered above as atom style
+// "atomic/cuda". Each pack/unpack entry point falls back to the
+// AtomVecAtomic implementation on the host path and otherwise goes through
+// the Cuda_* wrapper functions (see atom_vec_atomic_cuda.cpp).
+class AtomVecAtomicCuda : public AtomVecAtomic {
+ public:
+  AtomVecAtomicCuda(class LAMMPS *, int, char **);
+  // NOTE(review): empty destructor -- the buffers created by grow_copylist()
+  // (copylist, copylist2, cu_copylist) do not appear to be released by this
+  // class; confirm whether that is intentional.
+  virtual ~AtomVecAtomicCuda() {}
+  // Buffer management: copy lists, send buffer, per-atom arrays.
+  void grow_copylist(int n);
+  void grow_send(int n,double** buf_send,int flag);
+  void grow_both(int n);
+  // Forward communication (with and without velocities).
+  int pack_comm(int, int *, double *, int, int *);
+  int pack_comm_vel(int, int *, double *, int, int *);
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  // Reverse communication of forces.
+  int pack_reverse(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+  // Border atom communication.
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  // Exchange of atoms between processors.
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+ private:
+  class Cuda *cuda;            // taken from lmp->cuda in the constructor
+  bool cuda_init_done;         // set after the first Cuda_AtomVecAtomicCuda_Init call in pack_exchange
+  int* copylist;               // pinned host index list, wrapped by cu_copylist
+  int* copylist2;              // heap scratch list of max_nsend entries
+  cCudaData<int, int, xx >* cu_copylist;  // cCudaData wrapper built over copylist
+  int max_nsend;               // number of entries in copylist and copylist2
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/atom_vec_charge_cuda.cpp
+++ b/src/USER-CUDA/atom_vec_charge_cuda.cpp
@ -0,0 +1,407 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "atom_vec_charge_cuda.h"
+#include "comm_cuda_cu.h"
+#include "atom_vec_charge_cuda_cu.h"
+#include "atom.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+#include "comm.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+#define BUFFACTOR 1.5
+#define BUFEXTRA 1000
+#define NCUDAEXCHANGE 12 //nextra x y z vx vy vz tag type mask image q
+
+#define BUF_FLOAT double
+/* ---------------------------------------------------------------------- */
+
+AtomVecChargeCuda::AtomVecChargeCuda(LAMMPS *lmp, int narg, char **arg) :
+  AtomVecCharge(lmp, narg, arg)
+{
+  // A /cuda atom style is only usable when the LAMMPS instance carries a
+  // Cuda object; otherwise the problem is reported through error->all().
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // The copy lists are created lazily by grow_copylist().
+  copylist = NULL;
+  copylist2 = NULL;
+  cu_copylist = NULL;
+  max_nsend = 0;
+
+  // Device-side initialisation is deferred until the first pack_exchange().
+  cuda_init_done = false;
+  cudable = true;
+  maxsend = 0;
+}
+
+// Discard the current copy lists and allocate fresh ones holding
+// new_max_nsend entries each. Previous contents are not preserved.
+void AtomVecChargeCuda::grow_copylist(int new_max_nsend)
+{
+  max_nsend = new_max_nsend;
+
+  // Release the old cCudaData wrapper, scratch list and pinned host list.
+  delete cu_copylist;
+  delete [] copylist2;
+  if (copylist != NULL) {
+    CudaWrapper_FreePinnedHostData((void*) copylist);
+  }
+
+  // copylist lives in pinned host memory and is wrapped by cu_copylist;
+  // copylist2 is an ordinary heap array of the same length.
+  copylist = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false);
+  copylist2 = new int[max_nsend];
+  cu_copylist = new cCudaData<int, int, xx > (copylist, max_nsend);
+}
+
+/* ----------------------------------------------------------------------
+   (re)allocate the send buffer *buf_send so that it holds
+   BUFFACTOR*n + BUFEXTRA doubles, and store BUFFACTOR*n in *maxsend
+   needed here because the comm send buffer has to be able to grow, since
+   the array shall be copied from the GPU as a whole
+   n        = requested number of doubles (before BUFFACTOR is applied)
+   buf_send = address of the buffer pointer, updated in place
+   flag     = nonzero: keep the existing contents, zero: contents are dropped
+   the buffer is pinned host memory when cuda->pinned is set, otherwise it
+   is managed through the LAMMPS memory class
+   assumes the old buffer holds (old *maxsend)+BUFEXTRA doubles
+------------------------------------------------------------------------- */
+
+void AtomVecChargeCuda::grow_send(int n,double** buf_send,int flag)
+{
+  const int old_size = *maxsend+BUFEXTRA;
+  *maxsend = static_cast<int> (BUFFACTOR * n);
+  const int new_size = *maxsend+BUFEXTRA;
+
+  if(cuda->pinned)
+  {
+    double* old_buf = *buf_send;
+    if(flag && old_buf)
+    {
+      // copy straight from the old into the new buffer; never copy more than
+      // the smaller of the two sizes and never read from a NULL buffer
+      double* new_buf = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+      const int ncopy = old_size < new_size ? old_size : new_size;
+      memcpy((void*) new_buf,(void*) old_buf,ncopy*sizeof(double));
+      CudaWrapper_FreePinnedHostData((void*) old_buf);
+      *buf_send = new_buf;
+    }
+    else
+    {
+      // nothing to preserve: free first, then allocate
+      if(old_buf) CudaWrapper_FreePinnedHostData((void*) old_buf);
+      *buf_send = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+    }
+  }
+  else if(flag)
+  {
+    *buf_send = (double *)
+      memory->srealloc(*buf_send,new_size*sizeof(double),
+                       "comm:buf_send");
+  }
+  else
+  {
+    memory->sfree(*buf_send);
+    *buf_send = (double *) memory->smalloc(new_size*sizeof(double),
+                                           "comm:buf_send");
+  }
+}
+
+// Grow the per-atom arrays through AtomVecCharge::grow(n). Once the Cuda
+// setup has finished, the call is bracketed by a download of all data
+// beforehand and a resize check plus upload afterwards.
+void AtomVecChargeCuda::grow_both(int n)
+{
+  if (cuda->finished_setup) {
+    cuda->downloadAll();
+  }
+
+  AtomVecCharge::grow(n);
+
+  if (cuda->finished_setup) {
+    cuda->checkResize();
+    cuda->uploadAll();
+  }
+}
+
+// Pack forward-communication data.
+// Author's note: usually this should not be called, since comm->communicate
+// handles the communication itself when only positions are exchanged.
+// Delegates to AtomVecCharge::pack_comm until the Cuda setup has finished or
+// while running on the CPU; otherwise Cuda_CommCuda_PackComm is called with
+// *iswap passed by value. The returned count is rescaled when X_FLOAT and
+// double differ in size.
+int AtomVecChargeCuda::pack_comm(int n, int* iswap, double *buf,
+                                 int pbc_flag, int *pbc)
+{
+  const bool use_host = !cuda->finished_setup || cuda->oncpu;
+  if (use_host) {
+    return AtomVecCharge::pack_comm(n,iswap,buf,pbc_flag,pbc);
+  }
+
+  int count = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+  if (count && (sizeof(X_FLOAT)!=sizeof(double))) {
+    count = (count+1)*sizeof(X_FLOAT)/sizeof(double);
+  }
+  return count;
+}
+
+// Velocity-carrying variant of pack_comm.
+// Author's note: usually this should not be called, since comm->communicate
+// handles the communication itself when only positions are exchanged.
+// Host path: AtomVecCharge::pack_comm_vel; otherwise
+// Cuda_CommCuda_PackCommVel, with the returned count rescaled when X_FLOAT
+// and double differ in size.
+int AtomVecChargeCuda::pack_comm_vel(int n, int* iswap, double *buf,
+                                     int pbc_flag, int *pbc)
+{
+  const bool use_host = !cuda->finished_setup || cuda->oncpu;
+  if (use_host) {
+    return AtomVecCharge::pack_comm_vel(n,iswap,buf,pbc_flag,pbc);
+  }
+
+  int count = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+  if (count && (sizeof(X_FLOAT)!=sizeof(double))) {
+    count = (count+1)*sizeof(X_FLOAT)/sizeof(double);
+  }
+  return count;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Unpack forward-communication data for n atoms starting at index first.
+// Author's note: usually this should not be called, since comm->communicate
+// handles the communication itself when only positions are exchanged.
+// Host path: AtomVecCharge::unpack_comm; otherwise Cuda_CommCuda_UnpackComm.
+void AtomVecChargeCuda::unpack_comm(int n, int first, double *buf)
+{
+  if (cuda->finished_setup && !cuda->oncpu) {
+    Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
+  } else {
+    AtomVecCharge::unpack_comm(n,first,buf);
+  }
+}
+
+// Velocity-carrying variant of unpack_comm.
+// Author's note: usually this should not be called, since comm->communicate
+// handles the communication itself when only positions are exchanged.
+// Host path: AtomVecCharge::unpack_comm_vel; otherwise
+// Cuda_CommCuda_UnpackCommVel.
+void AtomVecChargeCuda::unpack_comm_vel(int n, int first, double *buf)
+{
+  if (cuda->finished_setup && !cuda->oncpu) {
+    Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf);
+  } else {
+    AtomVecCharge::unpack_comm_vel(n,first,buf);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Pack the forces of atoms [first, first+n) into buf, three doubles per
+// atom, and return the number of doubles written.
+// Author's note: usually this should not be called, since comm->communicate
+// handles the communication itself when only forces are exchanged.
+// Host path: AtomVecCharge::pack_reverse. Otherwise the force array is
+// downloaded via cuda->cu_f before reading and uploaded again afterwards.
+int AtomVecChargeCuda::pack_reverse(int n, int first, double *buf)
+{
+  if (!cuda->finished_setup || cuda->oncpu) {
+    return AtomVecCharge::pack_reverse(n,first,buf);
+  }
+
+  cuda->cu_f->download();
+  int count = 0;
+  const int stop = first + n;
+  for (int idx = first; idx < stop; ++idx) {
+    for (int d = 0; d < 3; ++d) {
+      buf[count++] = f[idx][d];
+    }
+  }
+  cuda->cu_f->upload();
+  return count;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Add the forces stored in buf (three doubles per entry) onto the atoms
+// listed in list[0..n-1].
+// Author's note: usually this should not be called, since comm->communicate
+// handles the communication itself when only forces are exchanged.
+// Host path: AtomVecCharge::unpack_reverse. Otherwise the force array is
+// downloaded via cuda->cu_f, updated on the host and uploaded again.
+void AtomVecChargeCuda::unpack_reverse(int n, int *list, double *buf)
+{
+  if (!cuda->finished_setup || cuda->oncpu) {
+    AtomVecCharge::unpack_reverse(n,list,buf);
+    return;
+  }
+
+  int pos = 0;
+  cuda->cu_f->download();
+  for (int k = 0; k < n; ++k) {
+    const int target = list[k];
+    for (int d = 0; d < 3; ++d) {
+      f[target][d] += buf[pos++];
+    }
+  }
+  cuda->cu_f->upload();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Pack border-atom data. Delegates to AtomVecCharge::pack_border until the
+// Cuda setup has finished or while running on the CPU; otherwise returns
+// the value of Cuda_AtomVecChargeCuda_PackBorder, called with *iswap passed
+// by value.
+int AtomVecChargeCuda::pack_border(int n, int *iswap, double *buf,
+                                   int pbc_flag, int *pbc)
+{
+  const bool use_host = !cuda->finished_setup || cuda->oncpu;
+  if (use_host) {
+    return AtomVecCharge::pack_border(n,iswap,buf,pbc_flag,pbc);
+  }
+
+  return Cuda_AtomVecChargeCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+}
+
+// Velocity-carrying variant of pack_border: AtomVecCharge::pack_border_vel
+// on the host path, Cuda_AtomVecChargeCuda_PackBorderVel otherwise.
+int AtomVecChargeCuda::pack_border_vel(int n, int *iswap, double *buf,
+                                       int pbc_flag, int *pbc)
+{
+  const bool use_host = !cuda->finished_setup || cuda->oncpu;
+  if (use_host) {
+    return AtomVecCharge::pack_border_vel(n,iswap,buf,pbc_flag,pbc);
+  }
+
+  return Cuda_AtomVecChargeCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Unpack n border atoms starting at index first.
+// Host path: AtomVecCharge::unpack_border. Otherwise the arrays are grown
+// via grow_both(0) until there is enough space on the device to unpack the
+// data, then Cuda_AtomVecChargeCuda_UnpackBorder is called; a nonzero
+// result is only reported on stdout.
+void AtomVecChargeCuda::unpack_border(int n, int first, double *buf)
+{
+  if (!cuda->finished_setup || cuda->oncpu) {
+    AtomVecCharge::unpack_border(n,first,buf);
+    return;
+  }
+
+  while (atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) {
+    grow_both(0);
+  }
+
+  const int failed = Cuda_AtomVecChargeCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf);
+  if (failed) {
+    printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+  }
+}
+
+// Velocity-carrying variant of unpack_border: same host fallback and growth
+// loop (ensuring enough space on the device), with
+// Cuda_AtomVecChargeCuda_UnpackBorderVel doing the unpacking.
+// A nonzero result is only reported on stdout.
+void AtomVecChargeCuda::unpack_border_vel(int n, int first, double *buf)
+{
+  if (!cuda->finished_setup || cuda->oncpu) {
+    AtomVecCharge::unpack_border_vel(n,first,buf);
+    return;
+  }
+
+  while (atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax) {
+    grow_both(0);
+  }
+
+  const int failed = Cuda_AtomVecChargeCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf);
+  if (failed) {
+    printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack data for atom I for sending to another proc
+   xyz must be 1st 3 values, so comm::exchange() can test on them 
+------------------------------------------------------------------------- */
+
+
+// Pack all atoms that leave this processor along dimension dim.
+// On the CPU path this simply forwards to AtomVecCharge::pack_exchange.
+// On the CUDA path buf is NOT a data buffer: it is reinterpreted as the
+// address of the send-buffer pointer so the buffer can be reallocated here.
+// Resulting buffer layout, as written by this function: slot 0 holds the
+// number of sent atoms, slots 1..nsend_atoms hold the per-atom count of
+// extra fix values, followed by whatever Cuda_AtomVecChargeCuda_PackExchange
+// and the fixes wrote.
+// Returns the number of doubles used, or 0 when only the counter slot is used.
+int AtomVecChargeCuda::pack_exchange(int dim, double *buf)
+{
+  if(cuda->oncpu)
+  	return AtomVecCharge::pack_exchange(dim,buf);
+
+  // (re)run the device-side init on first use and whenever the box changes
+  if(not cuda_init_done||domain->box_change)
+  {
+  	Cuda_AtomVecChargeCuda_Init(&cuda->shared_data);
+  	cuda_init_done=true;
+  }
+  double** buf_pointer=(double**) buf;
+  // make sure a buffer exists and *maxsend is at least nghost
+  if(*maxsend<atom->nghost || *buf_pointer==NULL)
+  {
+  	grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0);
+  	// NOTE(review): grow_send already stored BUFFACTOR*n in *maxsend, so
+  	// this assignment looks like it leaves the value unchanged -- confirm
+  	*maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend;
+  }
+  
+  if(max_nsend==0) grow_copylist(200);
+
+  // returns the number of atoms to send; slots 1..nsend_atoms of the buffer
+  // are read back below as atom indices
+  int nsend_atoms = Cuda_AtomVecChargeCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  
+  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
+  // buffer too small for the full per-atom data: grow it (contents dropped)
+  // and build the list again
+  if(nsend_atoms*NCUDAEXCHANGE>*maxsend) 
+  {
+  	grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
+  	Cuda_AtomVecChargeCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  }
+  
+  // local atom count after the exchange
+  int nlocal=atom->nlocal-nsend_atoms;
+  
+  // copylist2[k] describes slot nlocal+k at the end of the local range:
+  // -1 if that atom is itself being sent, 1 otherwise
+  for(int i=0;i<nsend_atoms;i++) copylist2[i]=1;
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i>=nlocal) copylist2[i-nlocal]=-1;
+  }
+  
+  // for every sent atom below the new nlocal, record in copylist the next
+  // tail slot that is not marked -1 (presumably the atom that will be moved
+  // into the vacated slot -- confirm against the device code)
+  int actpos=0;
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i<nlocal) 
+  	{
+  	  while(copylist2[actpos]==-1) actpos++;
+    	  copylist[j-1]=nlocal+actpos;
+  	  actpos++;
+  	}
+  }
+  cu_copylist->upload();
+    
+  cuda->shared_data.atom.nlocal=nlocal;
+  
+  // m = number of doubles in the buffer after the per-atom data was packed
+  int m = Cuda_AtomVecChargeCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data());
+  
+  // let every fix with extra per-atom data append its values after position m
+  if (atom->nextra_grow)
+  for(int j=0;j<nsend_atoms;j++)
+  {
+      int i=static_cast <int> ((*buf_pointer)[j+1]);
+      int nextra=0;
+      for (int iextra = 0; iextra < atom->nextra_grow; iextra++) {
+      	
+        int dm = modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&((*buf_pointer)[m]));
+        m+=dm;
+  		nextra+=dm;
+        // keep the fix arrays in step with the slot recorded in copylist
+        if(i<nlocal)modify->fix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i);
+        // flag=1: grow the buffer while keeping what has been packed so far
+        if(m>*maxsend)  grow_send(m,buf_pointer,1);
+      }
+      // the index slot is reused to carry the number of extra values
+      (*buf_pointer)[j+1] = nextra;
+  }
+
+  (*buf_pointer)[0] = nsend_atoms;
+  atom->nlocal-=nsend_atoms;
+  cuda->shared_data.atom.update_nlocal=2;
+
+  if(m==1) return 0;//m is at least 1 in cuda since buf[0] contains number of atoms
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Unpack atoms received in an exchange and return the number of doubles
+// consumed from buf.
+// On the CPU path this forwards to AtomVecCharge::unpack_exchange.
+// On the CUDA path buf holds one message, or two back-to-back messages when
+// the processor grid has more than two processors along the exchange
+// dimension stored in cuda->shared_data. Each message starts with the
+// number of atoms in slot 0; slot j+1 holds the number of extra fix values
+// of atom j, which is used to skip over atoms that were not accepted.
+int AtomVecChargeCuda::unpack_exchange(double *buf)
+{
+  if(cuda->oncpu)
+    return AtomVecCharge::unpack_exchange(buf);
+
+  const int dim=cuda->shared_data.exchange_dim;
+  if(domain->box_change)
+    Cuda_AtomVecChargeCuda_Init(&cuda->shared_data);
+
+  // number of messages stored in buf
+  const int npasses = comm->procgrid[dim]>2 ? 2 : 1;
+
+  int mfirst=0;
+  for(int pi=0;pi<npasses;pi++)
+  {
+    int nlocal = atom->nlocal;
+    int nsend_atoms=static_cast<int> (buf[0]);
+    if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
+
+    // make room for the incoming atoms before unpacking
+    if (nlocal+nsend_atoms+atom->nghost>=atom->nmax) grow_both(nlocal+nsend_atoms*2+atom->nghost);
+    int naccept = Cuda_AtomVecChargeCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data());
+    // copylist[j] > -1 is the local index an accepted atom j was stored at
+    cu_copylist->download();
+
+    // position of the first extra fix value in this message
+    int m = nsend_atoms*NCUDAEXCHANGE + 1;
+    nlocal+=naccept;
+    if (atom->nextra_grow)
+    {
+      for(int j=0;j<nsend_atoms;j++)
+      {
+        if(copylist[j]>-1)
+        {
+          for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+            m += modify->fix[atom->extra_grow[iextra]]->
+              unpack_exchange(copylist[j],&buf[m]);
+        }
+        else
+        {
+          // atom not accepted: skip its extra values
+          m+=static_cast <int> (buf[j+1]);
+        }
+      }
+    }
+    cuda->shared_data.atom.nlocal=nlocal;
+    cuda->shared_data.atom.update_nlocal=2;
+    atom->nlocal=nlocal;
+    mfirst+=m;
+    // continue with the next message, if any
+    buf=&buf[m];
+  }
+  return mfirst;
+}
+
+
+
--- a/src/USER-CUDA/atom_vec_charge_cuda.h
+++ b/src/USER-CUDA/atom_vec_charge_cuda.h
@ -0,0 +1,69 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(charge/cuda,AtomVecChargeCuda)
+
+#else
+
+#ifndef LMP_ATOM_VEC_CHARGE_CUDA_H
+#define LMP_ATOM_VEC_CHARGE_CUDA_H
+
+#include "atom_vec_charge.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// CUDA-enabled counterpart of AtomVecCharge, registered above as atom style
+// "charge/cuda". Each pack/unpack entry point falls back to the
+// AtomVecCharge implementation on the host path and otherwise goes through
+// the Cuda_* wrapper functions (see atom_vec_charge_cuda.cpp).
+class AtomVecChargeCuda : public AtomVecCharge {
+ public:
+  AtomVecChargeCuda(class LAMMPS *, int, char **);
+  // NOTE(review): empty destructor -- the buffers created by grow_copylist()
+  // (copylist, copylist2, cu_copylist) do not appear to be released by this
+  // class; confirm whether that is intentional.
+  virtual ~AtomVecChargeCuda() {}
+  // Buffer management: copy lists, send buffer, per-atom arrays.
+  void grow_copylist(int n);
+  void grow_send(int n,double** buf_send,int flag);
+  void grow_both(int n);
+  // Forward communication (with and without velocities).
+  int pack_comm(int, int *, double *, int, int *);
+  int pack_comm_vel(int, int *, double *, int, int *);
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  // Reverse communication of forces.
+  int pack_reverse(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+  // Border atom communication.
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  // Exchange of atoms between processors.
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+ private:
+  class Cuda *cuda;            // taken from lmp->cuda in the constructor
+  bool cuda_init_done;         // set after the first Cuda_AtomVecChargeCuda_Init call in pack_exchange
+  int* copylist;               // pinned host index list, wrapped by cu_copylist
+  int* copylist2;              // heap scratch list of max_nsend entries
+  cCudaData<int, int, xx >* cu_copylist;  // cCudaData wrapper built over copylist
+  int max_nsend;               // number of entries in copylist and copylist2
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/atom_vec_full_cuda.cpp
+++ b/src/USER-CUDA/atom_vec_full_cuda.cpp
@ -0,0 +1,516 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "atom_vec_full_cuda.h"
+#include "comm_cuda_cu.h"
+#include "atom_vec_full_cuda_cu.h"
+#include "atom.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+#include "universe.h"
+#include "comm.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+#define BUFFACTOR 1.5
+#define BUFEXTRA 1000
+#define NCUDAEXCHANGE 13 //nextra x y z vx vy vz tag type mask image q molecule
+
+#define BUF_FLOAT double
+/* ---------------------------------------------------------------------- */
+
+// Construct the "full/cuda" atom style. Requires that the LAMMPS instance
+// carries a Cuda object; otherwise an error is raised. All copy-list
+// bookkeeping starts out empty.
+AtomVecFullCuda::AtomVecFullCuda(LAMMPS *lmp, int narg, char **arg) :
+  AtomVecFull(lmp, narg, arg),
+  cuda(lmp->cuda),
+  cuda_init_done(false),
+  copylist(NULL),
+  copylist2(NULL),
+  cu_copylist(NULL),
+  max_nsend(0)
+{
+  // A /cuda style cannot work without the accelerator object.
+  if(!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // maxsend is used as a pointer elsewhere in this file (*maxsend); start unset.
+  maxsend=0;
+  cudable=true;
+}
+
+// Re-create the copy lists with room for new_max_nsend entries.
+// The previous contents are discarded, not copied over.
+void AtomVecFullCuda::grow_copylist(int new_max_nsend)
+{
+  // Drop the old cCudaData wrapper first, then the two host arrays.
+  delete cu_copylist;
+  if(copylist != NULL) CudaWrapper_FreePinnedHostData((void*) copylist);
+  delete [] copylist2;
+
+  max_nsend = new_max_nsend;
+
+  // copylist comes from the pinned-host allocator, copylist2 from the heap.
+  copylist    = (int*) CudaWrapper_AllocPinnedHostData(max_nsend*sizeof(int),false);
+  copylist2   = new int[max_nsend];
+  cu_copylist = new cCudaData<int, int, xx > (copylist, max_nsend);
+}
+
+// Resize the comm send buffer. The buffer must be growable from here because
+// the whole array is copied from the GPU in one piece.
+//   n         requested size in doubles; *maxsend becomes BUFFACTOR*n and the
+//             allocation holds *maxsend+BUFEXTRA doubles
+//   buf_send  address of the buffer pointer; updated in place
+//   flag      nonzero: keep the old contents, zero: contents are discarded
+// When cuda->pinned is set the buffer is managed with the pinned-host
+// allocator, otherwise with the regular LAMMPS memory class.
+void AtomVecFullCuda::grow_send(int n,double** buf_send,int flag)
+{
+  const int old_size=*maxsend+BUFEXTRA;
+  *maxsend = static_cast<int> (BUFFACTOR * n);
+  const int new_size=*maxsend+BUFEXTRA;
+
+  if(cuda->pinned)
+  {
+    double* old_buf=*buf_send;
+    if(flag && old_buf)
+    {
+      // Copy straight into the new allocation; never read a NULL source and
+      // never copy more than the smaller of the two buffers can hold.
+      double* new_buf = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+      const int ncopy = old_size<new_size ? old_size : new_size;
+      memcpy((void*) new_buf,(void*) old_buf,ncopy*sizeof(double));
+      CudaWrapper_FreePinnedHostData((void*) old_buf);
+      *buf_send = new_buf;
+    }
+    else
+    {
+      // Nothing to preserve: release first to keep the peak footprint low.
+      if(old_buf) CudaWrapper_FreePinnedHostData((void*) old_buf);
+      *buf_send = (double*) CudaWrapper_AllocPinnedHostData(new_size*sizeof(double),false);
+    }
+  }
+  else if(flag)
+  {
+    *buf_send = (double *)
+      memory->srealloc(*buf_send,new_size*sizeof(double),
+		       "comm:buf_send");
+  }
+  else
+  {
+    memory->sfree(*buf_send);
+    *buf_send = (double *) memory->smalloc(new_size*sizeof(double),
+					  "comm:buf_send");
+  }
+}
+
+// Grow the per-atom arrays through AtomVecFull::grow(n). Once the CUDA setup
+// has finished, the call is bracketed by cuda->downloadAll() before and
+// cuda->checkResize() + cuda->uploadAll() after.
+void AtomVecFullCuda::grow_both(int n)
+{
+  if(cuda->finished_setup)
+  {
+    cuda->downloadAll();
+  }
+
+  AtomVecFull::grow(n);
+
+  if(!cuda->finished_setup) return;
+
+  cuda->checkResize();
+  cuda->uploadAll();
+}
+
+// Forward-communication pack. Usually this is not called, since
+// comm->communicate handles the communication if only positions are exchanged.
+// Until the CUDA setup has finished (or while running on the CPU) the
+// AtomVecFull version is used; otherwise Cuda_CommCuda_PackComm does the work
+// and *iswap is handed to it as the swap index.
+int AtomVecFullCuda::pack_comm(int n, int* iswap, double *buf,
+			     int pbc_flag, int *pbc)
+{
+  const bool on_device = cuda->finished_setup && !cuda->oncpu;
+  if(!on_device)
+    return AtomVecFull::pack_comm(n,iswap,buf,pbc_flag,pbc);
+
+  int nvalues = Cuda_CommCuda_PackComm(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+
+  // Rescale the count when X_FLOAT and double differ in size.
+  if(nvalues && (sizeof(X_FLOAT)!=sizeof(double)))
+    nvalues=(nvalues+1)*sizeof(X_FLOAT)/sizeof(double);
+  return nvalues;
+}
+
+// Forward-communication pack including velocities. Usually this is not
+// called, since comm->communicate handles the communication if only positions
+// are exchanged. Uses the AtomVecFull version until the CUDA setup has
+// finished (or while running on the CPU); otherwise calls
+// Cuda_CommCuda_PackCommVel with *iswap as the swap index.
+int AtomVecFullCuda::pack_comm_vel(int n, int* iswap, double *buf,
+			     int pbc_flag, int *pbc)
+{
+  const bool on_device = cuda->finished_setup && !cuda->oncpu;
+  if(!on_device)
+    return AtomVecFull::pack_comm_vel(n,iswap,buf,pbc_flag,pbc);
+
+  int nvalues = Cuda_CommCuda_PackCommVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+
+  // Rescale the count when X_FLOAT and double differ in size.
+  if(nvalues && (sizeof(X_FLOAT)!=sizeof(double)))
+    nvalues=(nvalues+1)*sizeof(X_FLOAT)/sizeof(double);
+  return nvalues;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Forward-communication unpack. Usually this is not called, since
+// comm->communicate handles the communication if only positions are exchanged.
+void AtomVecFullCuda::unpack_comm(int n, int first, double *buf)
+{
+  if(cuda->finished_setup && !cuda->oncpu)
+  {
+    Cuda_CommCuda_UnpackComm(&cuda->shared_data,n,first,(void*)buf);
+  }
+  else
+  {
+    // CUDA not ready, or running on the CPU: use the base implementation.
+    AtomVecFull::unpack_comm(n,first,buf);
+  }
+}
+
+// Forward-communication unpack including velocities. Usually this is not
+// called, since comm->communicate handles the communication if only positions
+// are exchanged.
+void AtomVecFullCuda::unpack_comm_vel(int n, int first, double *buf)
+{
+  if(cuda->finished_setup && !cuda->oncpu)
+  {
+    Cuda_CommCuda_UnpackCommVel(&cuda->shared_data,n,first,(void*)buf);
+  }
+  else
+  {
+    // CUDA not ready, or running on the CPU: use the base implementation.
+    AtomVecFull::unpack_comm_vel(n,first,buf);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Reverse-communication pack: writes the three force components of atoms
+// first .. first+n-1 into buf and returns the number of values written.
+// Usually this is not called, since comm->communicate handles the
+// communication if only forces are exchanged.
+int AtomVecFullCuda::pack_reverse(int n, int first, double *buf)
+{
+  if(!cuda->finished_setup || cuda->oncpu)
+    return AtomVecFull::pack_reverse(n,first,buf);
+
+  // The host force array is refreshed through cu_f before it is read.
+  cuda->cu_f->download();
+
+  int count = 0;
+  const int end = first + n;
+  for (int iatom = first; iatom < end; ++iatom)
+    for (int d = 0; d < 3; ++d)
+      buf[count++] = f[iatom][d];
+
+  cuda->cu_f->upload();
+  return count;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Reverse-communication unpack: adds three values per listed atom from buf
+// onto that atom's force. Usually this is not called, since
+// comm->communicate handles the communication if only forces are exchanged.
+void AtomVecFullCuda::unpack_reverse(int n, int *list, double *buf)
+{
+  if(!cuda->finished_setup || cuda->oncpu)
+  {
+    AtomVecFull::unpack_reverse(n,list,buf);
+    return;
+  }
+
+  // Bracket the host-side accumulation with download()/upload() on cu_f.
+  cuda->cu_f->download();
+
+  const double *src = buf;
+  for (int k = 0; k < n; ++k) {
+    const int j = list[k];
+    for (int d = 0; d < 3; ++d)
+      f[j][d] += *src++;
+  }
+
+  cuda->cu_f->upload();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Border pack. Uses AtomVecFull::pack_border until the CUDA setup has finished
+// (or while running on the CPU); otherwise the count returned by
+// Cuda_AtomVecFullCuda_PackBorder is passed through, with *iswap handed to it
+// as the swap index.
+int AtomVecFullCuda::pack_border(int n, int *iswap, double *buf,
+			       int pbc_flag, int *pbc)
+{
+  const bool on_device = cuda->finished_setup && !cuda->oncpu;
+  if(on_device)
+    return Cuda_AtomVecFullCuda_PackBorder(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+
+  return AtomVecFull::pack_border(n,iswap,buf,pbc_flag,pbc);
+}
+
+// Border pack including velocities. Uses AtomVecFull::pack_border_vel until
+// the CUDA setup has finished (or while running on the CPU); otherwise the
+// count returned by Cuda_AtomVecFullCuda_PackBorderVel is passed through.
+int AtomVecFullCuda::pack_border_vel(int n, int *iswap, double *buf,
+			       int pbc_flag, int *pbc)
+{
+  const bool on_device = cuda->finished_setup && !cuda->oncpu;
+  if(on_device)
+    return Cuda_AtomVecFullCuda_PackBorderVel(&cuda->shared_data,n,*iswap,(void*) buf,pbc,pbc_flag);
+
+  return AtomVecFull::pack_border_vel(n,iswap,buf,pbc_flag,pbc);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Border unpack. On the device path the atom arrays are grown until
+// nghost+nlocal+n fits below shared_data.atom.nmax, then
+// Cuda_AtomVecFullCuda_UnpackBorder is called; a nonzero result is only
+// reported on stdout.
+void AtomVecFullCuda::unpack_border(int n, int first, double *buf)
+{
+  if(cuda->finished_setup && !cuda->oncpu)
+  {
+    // ensure there is enough space on device to unpack data
+    while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax)
+      grow_both(0);
+
+    const int failed=Cuda_AtomVecFullCuda_UnpackBorder(&cuda->shared_data,n,first,(void*)buf);
+    if(failed)
+      printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+    return;
+  }
+
+  AtomVecFull::unpack_border(n,first,buf);
+}
+
+// Border unpack including velocities. On the device path the atom arrays are
+// grown until nghost+nlocal+n fits below shared_data.atom.nmax, then
+// Cuda_AtomVecFullCuda_UnpackBorderVel is called; a nonzero result is only
+// reported on stdout.
+void AtomVecFullCuda::unpack_border_vel(int n, int first, double *buf)
+{
+  if(cuda->finished_setup && !cuda->oncpu)
+  {
+    // ensure there is enough space on device to unpack data
+    while(atom->nghost+atom->nlocal+n>=cuda->shared_data.atom.nmax)
+      grow_both(0);
+
+    const int failed=Cuda_AtomVecFullCuda_UnpackBorderVel(&cuda->shared_data,n,first,(void*)buf);
+    if(failed)
+      printf(" # CUDA: Error: Failed to unpack Border atoms (This might be a bug).\n");
+    return;
+  }
+
+  AtomVecFull::unpack_border_vel(n,first,buf);
+}
+
+/* ----------------------------------------------------------------------
+   pack data for atom I for sending to another proc
+   xyz must be 1st 3 values, so comm::exchange() can test on them 
+------------------------------------------------------------------------- */
+
+
+// Pack the atoms selected by Cuda_AtomVecFullCuda_PackExchangeList for
+// dimension `dim` into the send buffer and remove them from the local list.
+//
+// NOTE: on the CUDA path `buf` is NOT the data buffer itself; it is
+// reinterpreted as double** (the address of the send-buffer pointer) so that
+// grow_send() can reallocate the buffer while packing.
+//
+// Buffer layout as written here:
+//   [0]                 number of packed atoms (nsend_atoms)
+//   [1 .. nsend_atoms]  first the local index of each atom (as filled by the
+//                       list kernel), later overwritten with the number of
+//                       extra values appended for that atom (nextra)
+//   [m ...]             per-atom topology data (bonds, angles, dihedrals,
+//                       impropers, specials, fix data), starting at the offset
+//                       returned by Cuda_AtomVecFullCuda_PackExchange
+//
+// Returns the number of values in the buffer, or 0 when that number is 1.
+// Falls back to AtomVecFull::pack_exchange() when cuda->oncpu is set.
+int AtomVecFullCuda::pack_exchange(int dim, double *buf)
+{
+  if(cuda->oncpu)
+  	return AtomVecFull::pack_exchange(dim,buf);
+
+  // (Re)initialise the device-side data once, and again whenever the box changes.
+  if(not cuda_init_done||domain->box_change)
+  {
+  	Cuda_AtomVecFullCuda_Init(&cuda->shared_data);
+  	cuda_init_done=true;
+  }
+  double** buf_pointer=(double**) buf;
+  // Make sure a buffer exists and is at least nghost values large.
+  if(*maxsend<atom->nghost || *buf_pointer==NULL)
+  {
+  	grow_send(atom->nghost>*maxsend?atom->nghost:*maxsend,buf_pointer,0);
+  	*maxsend=atom->nghost>*maxsend?atom->nghost:*maxsend;
+  }
+  
+  // First use: allocate the copy lists with an initial capacity.
+  if(max_nsend==0) grow_copylist(200);
+
+  int nsend_atoms = Cuda_AtomVecFullCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  
+  if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
+  // Buffer too small for the fixed-size part: grow (contents discarded) and
+  // build the list again.
+  if(nsend_atoms*NCUDAEXCHANGE>*maxsend) 
+  {
+  	grow_send((int) (nsend_atoms+100)*NCUDAEXCHANGE,buf_pointer,0);
+  	Cuda_AtomVecFullCuda_PackExchangeList(&cuda->shared_data,*maxsend,dim,*buf_pointer);
+  }
+
+  // nlocal is the local atom count after the selected atoms are removed.
+  int nlocal=atom->nlocal-nsend_atoms;
+  
+  // copylist2 describes the tail slots [nlocal, nlocal+nsend_atoms):
+  // -1 marks a slot whose atom is itself being sent, 1 a slot whose atom stays.
+  for(int i=0;i<nsend_atoms;i++) copylist2[i]=1;
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i>=nlocal) copylist2[i-nlocal]=-1;
+  }
+  
+  // For every sent atom below nlocal, pick the next staying tail atom as the
+  // one that will be copied into the hole it leaves.
+  int actpos=0;
+  for(int j=1;j<nsend_atoms+1;j++)
+  {
+  	int i = static_cast <int> ((*buf_pointer)[j]);
+  	if(i<nlocal) 
+  	{
+  	  while(copylist2[actpos]==-1) actpos++;
+    	  copylist[j-1]=nlocal+actpos;
+  	  actpos++;
+  	}
+  }
+  cu_copylist->upload();
+  
+  cuda->shared_data.atom.nlocal=nlocal;
+  
+  // m is the buffer offset at which the host-side packing below continues.
+  int m = Cuda_AtomVecFullCuda_PackExchange(&cuda->shared_data,nsend_atoms,*buf_pointer,cu_copylist->dev_data());
+  
+  timespec time1,time2;
+  clock_gettime(CLOCK_REALTIME,&time1);
+ 
+  // Append the topology data on the host. After each group the buffer is
+  // grown (contents preserved) once m exceeds *maxsend; buf_p must be
+  // refreshed because grow_send may move the buffer.
+  double* buf_p=*buf_pointer;
+  for(int j=0;j<nsend_atoms;j++)
+  {
+    int i=static_cast <int> (buf_p[j+1]);
+    int nextra=0;
+    int k;
+    buf_p[m++] = num_bond[i];
+    for (k = 0; k < num_bond[i]; k++) {
+      buf_p[m++] = bond_type[i][k];
+      buf_p[m++] = bond_atom[i][k];
+    }
+    nextra+=2*num_bond[i]+1;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+    
+    buf_p[m++] = num_angle[i];
+    for (k = 0; k < num_angle[i]; k++) {
+      buf_p[m++] = angle_type[i][k];
+      buf_p[m++] = angle_atom1[i][k];
+      buf_p[m++] = angle_atom2[i][k];
+      buf_p[m++] = angle_atom3[i][k];
+    }
+    nextra+=4*num_angle[i]+1;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+
+    buf_p[m++] = num_dihedral[i];
+    for (k = 0; k < num_dihedral[i]; k++) {
+      buf_p[m++] = dihedral_type[i][k];
+      buf_p[m++] = dihedral_atom1[i][k];
+      buf_p[m++] = dihedral_atom2[i][k];
+      buf_p[m++] = dihedral_atom3[i][k];
+      buf_p[m++] = dihedral_atom4[i][k];
+    }
+    nextra+=5*num_dihedral[i]+1;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+
+    buf_p[m++] = num_improper[i];
+    for (k = 0; k < num_improper[i]; k++) {
+      buf_p[m++] = improper_type[i][k];
+      buf_p[m++] = improper_atom1[i][k];
+      buf_p[m++] = improper_atom2[i][k];
+      buf_p[m++] = improper_atom3[i][k];
+      buf_p[m++] = improper_atom4[i][k];
+    }
+    nextra+=5*num_improper[i]+1;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+
+    buf_p[m++] = nspecial[i][0];
+    buf_p[m++] = nspecial[i][1];
+    buf_p[m++] = nspecial[i][2];
+    for (k = 0; k < nspecial[i][2]; k++) buf_p[m++] = special[i][k];
+    nextra+=nspecial[i][2]+3;
+    if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+  
+    // Let every fix with per-atom data append its values, and move the
+    // replacement atom's fix data into the hole when there is one.
+    if (atom->nextra_grow)
+      for (int iextra = 0; iextra < atom->nextra_grow; iextra++) 
+      {
+        int dm= modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf_p[m]);
+        m+=dm;
+  		nextra+=dm;
+        if(i<nlocal)modify->fix[atom->extra_grow[iextra]]->copy_arrays(copylist[j],i);
+        if(m>*maxsend) {grow_send(m,buf_pointer,1); buf_p=*buf_pointer;}
+      }
+
+    // Fill the hole at i with the replacement atom chosen above.
+    if(i<nlocal)AtomVecFull::copy(copylist[j],i,1);  
+    // The index slot is no longer needed; store the extra-value count instead.
+    (*buf_pointer)[j+1] = nextra;
+  }
+	  
+	  clock_gettime(CLOCK_REALTIME,&time2);
+	  cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+  (*buf_pointer)[0] = nsend_atoms;
+  atom->nlocal-=nsend_atoms;
+  cuda->shared_data.atom.update_nlocal=2;
+ //printf("End Pack Exchange\n");
+  if(m==1) return 0;
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Unpack atoms received during an exchange and append the accepted ones to
+// the local atom list.
+//
+// `buf` holds one message, or two back to back when comm->procgrid[dim] > 2.
+// Per message: [0] = atom count, [1..count] = number of extra values per
+// atom, then count*NCUDAEXCHANGE fixed values handled by
+// Cuda_AtomVecFullCuda_UnpackExchange, then the per-atom topology data that
+// is unpacked on the host below. After the device call copylist[j] holds the
+// local index assigned to atom j, or a negative value when it was rejected.
+//
+// Returns the total number of buffer values consumed.
+// Falls back to AtomVecFull::unpack_exchange() when cuda->oncpu is set.
+int AtomVecFullCuda::unpack_exchange(double *buf)
+{
+  if(cuda->oncpu)
+    return AtomVecFull::unpack_exchange(buf);
+
+  int dim=cuda->shared_data.exchange_dim;
+  if(domain->box_change)
+    Cuda_AtomVecFullCuda_Init(&cuda->shared_data);
+
+  // Same first-use guard as pack_exchange(): cu_copylist is dereferenced
+  // below and must exist even if nothing was ever packed on this proc.
+  if(max_nsend==0) grow_copylist(200);
+
+  int mfirst=0;
+  for(int pi=0;pi<(comm->procgrid[dim]>2?2:1);pi++)
+  {
+    int nlocal = atom->nlocal;
+    int nsend_atoms=static_cast<int> (buf[0]);
+    if(nsend_atoms>max_nsend) grow_copylist(nsend_atoms+100);
+
+    //ensure there is enough space on device to unpack data
+    if (nlocal+nsend_atoms+atom->nghost>=atom->nmax) grow_both(nlocal+nsend_atoms*2+atom->nghost);
+    int naccept = Cuda_AtomVecFullCuda_UnpackExchange(&cuda->shared_data,nsend_atoms,buf,cu_copylist->dev_data());
+    cu_copylist->download();
+    // Host-side data starts right after the header and the fixed-size part.
+    int m = nsend_atoms*NCUDAEXCHANGE + 1;
+    nlocal+=naccept;
+
+    timespec time1,time2;
+    clock_gettime(CLOCK_REALTIME,&time1);
+
+    for(int j=0;j<nsend_atoms;j++)
+    {
+      if(copylist[j]>-1)
+      {
+        int k;
+        int i=copylist[j];
+        num_bond[i] = static_cast<int> (buf[m++]);
+        for (k = 0; k < num_bond[i]; k++) {
+          bond_type[i][k] = static_cast<int> (buf[m++]);
+          bond_atom[i][k] = static_cast<int> (buf[m++]);
+        }
+
+        num_angle[i] = static_cast<int> (buf[m++]);
+        for (k = 0; k < num_angle[i]; k++) {
+          angle_type[i][k] = static_cast<int> (buf[m++]);
+          angle_atom1[i][k] = static_cast<int> (buf[m++]);
+          angle_atom2[i][k] = static_cast<int> (buf[m++]);
+          angle_atom3[i][k] = static_cast<int> (buf[m++]);
+        }
+
+        num_dihedral[i] = static_cast<int> (buf[m++]);
+        for (k = 0; k < num_dihedral[i]; k++) {
+          dihedral_type[i][k] = static_cast<int> (buf[m++]);
+          dihedral_atom1[i][k] = static_cast<int> (buf[m++]);
+          dihedral_atom2[i][k] = static_cast<int> (buf[m++]);
+          dihedral_atom3[i][k] = static_cast<int> (buf[m++]);
+          dihedral_atom4[i][k] = static_cast<int> (buf[m++]);
+        }
+
+        num_improper[i] = static_cast<int> (buf[m++]);
+        for (k = 0; k < num_improper[i]; k++) {
+          improper_type[i][k] = static_cast<int> (buf[m++]);
+          improper_atom1[i][k] = static_cast<int> (buf[m++]);
+          improper_atom2[i][k] = static_cast<int> (buf[m++]);
+          improper_atom3[i][k] = static_cast<int> (buf[m++]);
+          improper_atom4[i][k] = static_cast<int> (buf[m++]);
+        }
+
+        nspecial[i][0] = static_cast<int> (buf[m++]);
+        nspecial[i][1] = static_cast<int> (buf[m++]);
+        nspecial[i][2] = static_cast<int> (buf[m++]);
+        for (k = 0; k < nspecial[i][2]; k++)
+          special[i][k] = static_cast<int> (buf[m++]);
+
+        if (atom->nextra_grow)
+          for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+            m += modify->fix[atom->extra_grow[iextra]]->
+              unpack_exchange(i,&buf[m]);
+      }
+      else
+      {
+        // Rejected atom: skip its extra values using the stored count.
+        m+=static_cast <int> (buf[j+1]);
+      }
+    }
+
+    clock_gettime(CLOCK_REALTIME,&time2);
+    cuda->shared_data.cuda_timings.comm_exchange_cpu_pack+=
+        time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+    cuda->shared_data.atom.nlocal=nlocal;
+    cuda->shared_data.atom.update_nlocal=2;
+    atom->nlocal=nlocal;
+    mfirst+=m;
+    // Advance to the next message in the same buffer.
+    buf=&buf[m];
+  }
+  return mfirst;
+}
+
+
+
--- a/src/USER-CUDA/atom_vec_full_cuda.h
+++ b/src/USER-CUDA/atom_vec_full_cuda.h
@ -0,0 +1,69 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(full/cuda,AtomVecFullCuda)
+
+#else
+
+#ifndef LMP_ATOM_VEC_FULL_CUDA_H
+#define LMP_ATOM_VEC_FULL_CUDA_H
+
+#include "atom_vec_full.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// Atom style registered as "full/cuda" (see the AtomStyle() entry above).
+// Derives from AtomVecFull and declares its own versions of the communication
+// pack/unpack routines, implemented in atom_vec_full_cuda.cpp.
+class AtomVecFullCuda : public AtomVecFull {
+ public:
+  AtomVecFullCuda(class LAMMPS *, int, char **);
+  // NOTE(review): the destructor is empty, but grow_copylist() allocates
+  // copylist (pinned host memory), copylist2 and cu_copylist; nothing in this
+  // class releases them on destruction.
+  virtual ~AtomVecFullCuda() {}
+  // Buffer management helpers.
+  void grow_copylist(int n);
+  void grow_send(int n,double** buf_send,int flag);
+  void grow_both(int n);
+  // Forward communication (with and without velocities).
+  int pack_comm(int, int *, double *, int, int *);
+  int pack_comm_vel(int, int *, double *, int, int *);
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  // Reverse communication.
+  int pack_reverse(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+  // Border (ghost atom) communication.
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  // Atom exchange between procs.
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+ private:
+  class Cuda *cuda;              // accelerator object, taken from lmp->cuda
+  bool cuda_init_done;           // set once pack_exchange() has run the device init
+  int* copylist;                 // pinned host list, mirrored on the device by cu_copylist
+  int* copylist2;                // host scratch list used by pack_exchange()
+  cCudaData<int, int, xx >* cu_copylist;
+  int max_nsend;                 // capacity (entries) of copylist and copylist2
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/comm_cuda.cpp
+++ b/src/USER-CUDA/comm_cuda.cpp
@ -55,6 +55,8 @@ enum{SINGLE,MULTI};
 CommCuda::CommCuda(LAMMPS *lmp):Comm(lmp) 
 {
  cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");

  cu_pbc=NULL;
  cu_slablo=NULL;
--- a/src/USER-CUDA/comm_cuda.cu
+++ b/src/USER-CUDA/comm_cuda.cu
@ -0,0 +1,483 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#define MY_PREFIX comm_cuda
+#include "cuda_shared.h"
+#include "cuda_common.h"
+
+#include "crm_cuda_utils.cu"
+
+#include "comm_cuda_cu.h"
+#include "comm_cuda_kernel.cu"
+#include <ctime>
+
+// Make sure sdata->buffer can hold 3*n X_FLOAT values. When it is too small
+// the old buffer is released and a new one of exactly that size is allocated
+// (contents are not preserved) and sdata->buffer_new is incremented. In every
+// case the buffer address is written to the MY_CONST(buffer) device symbol.
+void Cuda_CommCuda_UpdateBuffer(cuda_shared_data* sdata,int n)
+{
+		int size=n*3*sizeof(X_FLOAT);
+		if(sdata->buffersize<size)
+		{
+			// buffersize is a byte count, so report it as bytes.
+			MYDBG(printf("Cuda_CommCuda Resizing Buffer at %p with %i bytes to\n",sdata->buffer,sdata->buffersize);)
+			CudaWrapper_FreeCudaData(sdata->buffer,sdata->buffersize);
+			sdata->buffer = CudaWrapper_AllocCudaData(size);
+			sdata->buffersize=size;
+			sdata->buffer_new++;
+			MYDBG(printf("New buffer at %p with %i bytes\n",sdata->buffer,sdata->buffersize);)
+		}
+		cudaMemcpyToSymbol(MY_CONST(buffer), & sdata->buffer, sizeof(int*)     );
+}
+
+
+// Copy the current atom counts and the device addresses of the x, v, f and
+// type arrays from sdata into this module's device symbols.
+void Cuda_CommCuda_UpdateNmax(cuda_shared_data* sdata)
+{
+	// Device array addresses.
+	cudaMemcpyToSymbol(MY_CONST(x), &sdata->atom.x.dev_data, sizeof(X_FLOAT*));
+	cudaMemcpyToSymbol(MY_CONST(v), &sdata->atom.v.dev_data, sizeof(X_FLOAT*));
+	cudaMemcpyToSymbol(MY_CONST(f), &sdata->atom.f.dev_data, sizeof(F_FLOAT*));
+	cudaMemcpyToSymbol(MY_CONST(type), &sdata->atom.type.dev_data, sizeof(int*));
+
+	// Atom counts.
+	cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
+	cudaMemcpyToSymbol(MY_CONST(nmax), &sdata->atom.nmax, sizeof(int));
+}
+
+
+// One-time setup of this module's device symbols: atom data (through
+// Cuda_CommCuda_UpdateNmax), ntypes+1, the three box lengths in domain.prd,
+// and the flag / debugdata pointers.
+void Cuda_CommCuda_Init(cuda_shared_data* sdata)
+{
+	Cuda_CommCuda_UpdateNmax(sdata);
+
+	const int type_count=sdata->atom.ntypes+1;
+	cudaMemcpyToSymbol(MY_CONST(cuda_ntypes), &type_count, sizeof(int));
+	cudaMemcpyToSymbol(MY_CONST(prd), sdata->domain.prd, 3*sizeof(X_FLOAT));
+	cudaMemcpyToSymbol(MY_CONST(flag), &sdata->flag, sizeof(int*));
+	cudaMemcpyToSymbol(MY_CONST(debugdata), &sdata->debugdata, sizeof(int*));
+}
+
+// Forward-communication pack for swap `iswap`: launches
+// Cuda_CommCuda_PackComm_Kernel over the n entries of the send list, applying
+// the periodic shift derived from pbc when pbc_flag is nonzero. Without
+// overlap_comm the packed 3*n X_FLOAT values are copied to buf_send on the
+// host; with overlap_comm they stay in comm.buf_send_dev[iswap].
+// Nothing is launched when atom.nlocal is not positive. Always returns 3*n.
+int Cuda_CommCuda_PackComm(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
+{
+	timespec t_a,t_b;
+
+	// Refresh device symbols that the host marked as stale.
+	if(sdata->atom.update_nmax)
+		Cuda_CommCuda_UpdateNmax(sdata);
+	if(sdata->atom.update_nlocal)
+		cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
+
+	const int nbytes=n*3*sizeof(X_FLOAT);
+	if(sdata->buffer_new or (nbytes>sdata->buffersize))
+		Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+	// Periodic image shift (zero unless pbc_flag is set).
+	X_FLOAT shift_x=0.0;
+	X_FLOAT shift_y=0.0;
+	X_FLOAT shift_z=0.0;
+	if(pbc_flag != 0)
+	{
+		if(sdata->domain.triclinic != 0)
+		{
+			shift_x = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
+			shift_y = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
+		}
+		else
+		{
+			shift_x = pbc[0]*sdata->domain.prd[0];
+			shift_y = pbc[1]*sdata->domain.prd[1];
+		}
+		shift_z = pbc[2]*sdata->domain.prd[2];
+	}
+
+	int3 layout=getgrid(n);
+	dim3 threads(layout.z, 1, 1);
+	dim3 grid(layout.x, layout.y, 1);
+
+	if(sdata->atom.nlocal<=0)
+		return 3*n;
+
+	cudaMemset(sdata->flag,0,sizeof(int));
+
+	clock_gettime(CLOCK_REALTIME,&t_a);
+
+	// Pack either into the per-swap device buffer or the shared buffer.
+	void* target=sdata->buffer;
+	if(sdata->overlap_comm) target=sdata->comm.buf_send_dev[iswap];
+	Cuda_CommCuda_PackComm_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,
+		sdata->comm.maxlistlength,iswap,shift_x,shift_y,shift_z,target);
+	cudaThreadSynchronize();
+
+	clock_gettime(CLOCK_REALTIME,&t_b);
+	sdata->cuda_timings.comm_forward_kernel_pack+=
+		t_b.tv_sec-t_a.tv_sec+1.0*(t_b.tv_nsec-t_a.tv_nsec)/1000000000;
+
+	CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
+	if(!sdata->overlap_comm)
+		cudaMemcpy(buf_send, sdata->buffer, n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+
+	clock_gettime(CLOCK_REALTIME,&t_a);
+	sdata->cuda_timings.comm_forward_download+=
+		t_a.tv_sec-t_b.tv_sec+1.0*(t_a.tv_nsec-t_b.tv_nsec)/1000000000;
+
+	// Report a nonzero device-side flag.
+	int aflag;
+	cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+	if(aflag!=0) printf("aflag PackComm: %i\n",aflag);
+	CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
+
+	return 3*n;
+}
+
+// Forward-communication pack with velocities for swap `iswap`. Without
+// overlap_comm, 6*n X_FLOAT values are copied from sdata->buffer to buf_send.
+// Nothing is launched when atom.nlocal is not positive. Always returns 6*n.
+//
+// The shared buffer is sized for 6*n values: Cuda_CommCuda_UpdateBuffer()
+// reserves 3 values per unit of its argument, so it is called with 2*n.
+//
+// NOTE(review): this launches Cuda_CommCuda_PackComm_Kernel, the same kernel
+// the position-only pack uses, while 6*n values are downloaded afterwards.
+// Confirm whether a velocity-aware kernel should be launched here.
+int Cuda_CommCuda_PackCommVel(cuda_shared_data* sdata,int n,int iswap,void* buf_send,int* pbc,int pbc_flag)
+{
+
+    timespec time1,time2;
+	if(sdata->atom.update_nmax) 
+		Cuda_CommCuda_UpdateNmax(sdata);
+	if(sdata->atom.update_nlocal) 		
+		cudaMemcpyToSymbol(MY_CONST(nlocal)  , & sdata->atom.nlocal        , sizeof(int)      );
+	int size=n*6*sizeof(X_FLOAT);
+	if(sdata->buffer_new or (size>sdata->buffersize))
+		Cuda_CommCuda_UpdateBuffer(sdata,2*n);
+
+	// Periodic image shift (zero unless pbc_flag is set).
+	X_FLOAT dx=0.0;
+	X_FLOAT dy=0.0;
+	X_FLOAT dz=0.0;
+ 	if (pbc_flag != 0) {
+    if (sdata->domain.triclinic == 0) {
+      dx = pbc[0]*sdata->domain.prd[0];
+      dy = pbc[1]*sdata->domain.prd[1];
+      dz = pbc[2]*sdata->domain.prd[2];
+    } else {
+      dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
+      dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
+      dz = pbc[2]*sdata->domain.prd[2];
+    }}	
+
+	int3 layout=getgrid(n);
+	dim3 threads(layout.z, 1, 1);
+	dim3 grid(layout.x, layout.y, 1);
+	  
+	if(sdata->atom.nlocal>0)
+	{
+	  cudaMemset( sdata->flag,0,sizeof(int));
+
+clock_gettime(CLOCK_REALTIME,&time1);
+
+	  // Pack either into the per-swap device buffer or the shared buffer.
+	  void* buf=sdata->overlap_comm?sdata->comm.buf_send_dev[iswap]:sdata->buffer;
+	  Cuda_CommCuda_PackComm_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n
+	  ,sdata->comm.maxlistlength,iswap,dx,dy,dz,buf);
+	  cudaThreadSynchronize();
+
+clock_gettime(CLOCK_REALTIME,&time2);
+sdata->cuda_timings.comm_forward_kernel_pack+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+      
+	  CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
+      if(not sdata->overlap_comm)
+        cudaMemcpy(buf_send, sdata->buffer, n*6*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+      //cudaMemcpy(buf_send, sdata->comm.buf_send_dev[iswap], n*3*sizeof(X_FLOAT), cudaMemcpyDeviceToHost);
+
+clock_gettime(CLOCK_REALTIME,&time1);
+sdata->cuda_timings.comm_forward_download+=
+      time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
+
+	  // Report a nonzero device-side flag.
+	  int aflag;
+	  cudaMemcpy(&aflag, sdata->flag, sizeof(int), cudaMemcpyDeviceToHost);
+	  if(aflag!=0) printf("aflag PackComm: %i\n",aflag);
+	  CUT_CHECK_ERROR("Cuda_CommCuda_PackComm: Kernel execution failed");
+		
+	}		
+    return 6*n;
+}
+
+// Forward communication to self for swap `iswap`: launches
+// Cuda_CommCuda_PackComm_Self_Kernel over the n entries of the send list with
+// the periodic shift derived from pbc (when pbc_flag is nonzero) and `first`
+// as the kernel's last argument. No host buffer is involved.
+// Nothing is launched when atom.nlocal is not positive. Always returns 3*n.
+int Cuda_CommCuda_PackComm_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
+{
+	MYDBG(printf(" # CUDA: CommCuda_PackComm_Self\n");)
+    timespec time1,time2;
+	if(sdata->atom.update_nmax) 
+		Cuda_CommCuda_UpdateNmax(sdata);
+	if(sdata->atom.update_nlocal) 		
+		cudaMemcpyToSymbol(MY_CONST(nlocal)  , & sdata->atom.nlocal        , sizeof(int)      );
+	int size=n*3*sizeof(X_FLOAT);
+	if(sdata->buffer_new or (size>sdata->buffersize))
+		Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+	// Periodic image shift (zero unless pbc_flag is set).
+	X_FLOAT dx=0.0;
+	X_FLOAT dy=0.0;
+	X_FLOAT dz=0.0;
+ 	if (pbc_flag != 0) {
+    if (sdata->domain.triclinic == 0) {
+      dx = pbc[0]*sdata->domain.prd[0];
+      dy = pbc[1]*sdata->domain.prd[1];
+      dz = pbc[2]*sdata->domain.prd[2];
+    } else {
+      dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
+      dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
+      dz = pbc[2]*sdata->domain.prd[2];
+    }}	
+
+	int3 layout=getgrid(n);
+	dim3 threads(layout.z, 1, 1);
+	dim3 grid(layout.x, layout.y, 1);
+	if(sdata->atom.nlocal>0)
+	{
+
+clock_gettime(CLOCK_REALTIME,&time1);
+
+	  Cuda_CommCuda_PackComm_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
+	  cudaThreadSynchronize();
+
+clock_gettime(CLOCK_REALTIME,&time2);
+sdata->cuda_timings.comm_forward_kernel_self+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+	  CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
+	}	
+	
+    return 3*n;
+}
+
+// Forward communication to self, velocity variant, for swap `iswap`.
+// Nothing is launched when atom.nlocal is not positive. Always returns 6*n.
+//
+// The shared buffer is kept large enough for 6*n X_FLOAT values:
+// Cuda_CommCuda_UpdateBuffer() reserves 3 values per unit of its argument,
+// so it is called with 2*n.
+//
+// NOTE(review): this launches Cuda_CommCuda_PackComm_Self_Kernel, the same
+// kernel the position-only variant uses. Confirm whether a velocity-aware
+// kernel should be launched here.
+int Cuda_CommCuda_PackCommVel_Self(cuda_shared_data* sdata,int n,int iswap,int first,int* pbc,int pbc_flag)
+{
+	MYDBG(printf(" # CUDA: CommCuda_PackCommVel_Self\n");)
+    timespec time1,time2;
+	if(sdata->atom.update_nmax) 
+		Cuda_CommCuda_UpdateNmax(sdata);
+	if(sdata->atom.update_nlocal) 		
+		cudaMemcpyToSymbol(MY_CONST(nlocal)  , & sdata->atom.nlocal        , sizeof(int)      );
+	int size=n*6*sizeof(X_FLOAT);
+	if(sdata->buffer_new or (size>sdata->buffersize))
+		Cuda_CommCuda_UpdateBuffer(sdata,2*n);
+
+	// Periodic image shift (zero unless pbc_flag is set).
+	X_FLOAT dx=0.0;
+	X_FLOAT dy=0.0;
+	X_FLOAT dz=0.0;
+ 	if (pbc_flag != 0) {
+    if (sdata->domain.triclinic == 0) {
+      dx = pbc[0]*sdata->domain.prd[0];
+      dy = pbc[1]*sdata->domain.prd[1];
+      dz = pbc[2]*sdata->domain.prd[2];
+    } else {
+      dx = pbc[0]*sdata->domain.prd[0] + pbc[5]*sdata->domain.xy + pbc[4]*sdata->domain.xz;
+      dy = pbc[1]*sdata->domain.prd[1] + pbc[3]*sdata->domain.yz;
+      dz = pbc[2]*sdata->domain.prd[2];
+    }}	
+
+	int3 layout=getgrid(n);
+	dim3 threads(layout.z, 1, 1);
+	dim3 grid(layout.x, layout.y, 1);
+	if(sdata->atom.nlocal>0)
+	{
+
+clock_gettime(CLOCK_REALTIME,&time1);
+
+	  Cuda_CommCuda_PackComm_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,dx,dy,dz,first);
+	  cudaThreadSynchronize();
+
+clock_gettime(CLOCK_REALTIME,&time2);
+sdata->cuda_timings.comm_forward_kernel_self+=
+      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+	  CUT_CHECK_ERROR("Cuda_CommCuda_PackComm_Self: Kernel execution failed");
+	}	
+	
+    return 6*n;
+}
+
+// Forward-communication unpack: launches Cuda_CommCuda_UnpackComm_Kernel for
+// n atoms starting at `first`. With overlap_comm and iswap>=0 the kernel reads
+// comm.buf_recv_dev[iswap]; otherwise 3*n X_FLOAT values are first uploaded
+// from buf_recv into sdata->buffer and read from there.
+// Nothing is done when atom.nlocal is not positive.
+void Cuda_CommCuda_UnpackComm(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap)
+{
+	timespec t_a,t_b;
+
+	// Refresh device symbols that the host marked as stale.
+	if(sdata->atom.update_nmax)
+		Cuda_CommCuda_UpdateNmax(sdata);
+	if(sdata->atom.update_nlocal)
+		cudaMemcpyToSymbol(MY_CONST(nlocal), &sdata->atom.nlocal, sizeof(int));
+
+	const int nbytes=n*3*sizeof(X_FLOAT);
+	if(sdata->buffer_new or (nbytes>sdata->buffersize))
+		Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+	int3 layout=getgrid(n);
+	dim3 threads(layout.z, 1, 1);
+	dim3 grid(layout.x, layout.y, 1);
+
+	if(sdata->atom.nlocal<=0)
+		return;
+
+	// True when the data is taken from the per-swap device receive buffer.
+	const bool from_swap_buffer = sdata->overlap_comm && iswap>=0;
+
+	clock_gettime(CLOCK_REALTIME,&t_a);
+	if(!from_swap_buffer)
+		cudaMemcpy(sdata->buffer,(void*)buf_recv, n*3*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
+
+	clock_gettime(CLOCK_REALTIME,&t_b);
+	sdata->cuda_timings.comm_forward_upload+=
+		t_b.tv_sec-t_a.tv_sec+1.0*(t_b.tv_nsec-t_a.tv_nsec)/1000000000;
+
+	void* source=sdata->buffer;
+	if(from_swap_buffer) source=sdata->comm.buf_recv_dev[iswap];
+	Cuda_CommCuda_UnpackComm_Kernel<<<grid, threads,0>>>(n,first,source);
+	cudaThreadSynchronize();
+
+	clock_gettime(CLOCK_REALTIME,&t_a);
+	sdata->cuda_timings.comm_forward_kernel_unpack+=
+		t_a.tv_sec-t_b.tv_sec+1.0*(t_a.tv_nsec-t_b.tv_nsec)/1000000000;
+
+	CUT_CHECK_ERROR("Cuda_CommCuda_UnpackComm: Kernel execution failed");
+}
+
+// Host-side driver that unpacks a forward-communication receive buffer carrying
+// 6 X_FLOAT values per entry (see the size computation below) on the GPU.
+//   sdata    : shared CUDA state (buffers, flags, timing accumulators)
+//   n, first : forwarded unchanged to the unpack kernel
+//   buf_recv : host buffer holding n*6 X_FLOAT values
+//   iswap    : swap index; selects sdata->comm.buf_recv_dev[iswap] when
+//              overlap_comm is set and iswap>=0
+// Does no upload and no launch when sdata->atom.nlocal is not positive.
+// Wall-clock time of the upload and of the kernel is added to sdata->cuda_timings.
+void Cuda_CommCuda_UnpackCommVel(cuda_shared_data* sdata,int n,int first,void* buf_recv,int iswap)
+{
+    timespec time1,time2;
+
+	if(sdata->atom.update_nmax)
+		Cuda_CommCuda_UpdateNmax(sdata);
+	if(sdata->atom.update_nlocal)
+		cudaMemcpyToSymbol(MY_CONST(nlocal)  , & sdata->atom.nlocal        , sizeof(int)      );
+	int size=n*6*sizeof(X_FLOAT);
+	// NOTE(review): the buffer is resized with the same (sdata,n) arguments as the
+	// 3-value variant although 6 values per entry are uploaded here -- confirm that
+	// Cuda_CommCuda_UpdateBuffer reserves enough space.
+	if(sdata->buffer_new or (size>sdata->buffersize))
+		Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+	int3 layout=getgrid(n);
+	dim3 threads(layout.z, 1, 1);
+	dim3 grid(layout.x, layout.y, 1);
+	if(sdata->atom.nlocal>0)
+	{
+	  clock_gettime(CLOCK_REALTIME,&time1);
+
+	  // upload only when the per-swap device buffer is not used
+	  if(not sdata->overlap_comm||iswap<0)
+	    cudaMemcpy(sdata->buffer,(void*)buf_recv, n*6*sizeof(X_FLOAT), cudaMemcpyHostToDevice);
+
+	  clock_gettime(CLOCK_REALTIME,&time2);
+	  sdata->cuda_timings.comm_forward_upload+=
+	      time2.tv_sec-time1.tv_sec+1.0*(time2.tv_nsec-time1.tv_nsec)/1000000000;
+
+	  void* buf=(sdata->overlap_comm&&iswap>=0)?sdata->comm.buf_recv_dev[iswap]:sdata->buffer;
+	  // NOTE(review): this launches the same kernel as Cuda_CommCuda_UnpackComm;
+	  // confirm whether a velocity-aware unpack kernel was intended here.
+	  Cuda_CommCuda_UnpackComm_Kernel<<<grid, threads,0>>>(n,first,buf);
+	  cudaThreadSynchronize();
+
+	  // time1 is reused as the end-of-kernel timestamp
+	  clock_gettime(CLOCK_REALTIME,&time1);
+	  sdata->cuda_timings.comm_forward_kernel_unpack+=
+	      time1.tv_sec-time2.tv_sec+1.0*(time1.tv_nsec-time2.tv_nsec)/1000000000;
+
+	  // report failures under this function's own name (was "..._UnpackComm")
+	  CUT_CHECK_ERROR("Cuda_CommCuda_UnpackCommVel: Kernel execution failed");
+	}
+}
+
+// Copies n F_FLOAT values for each of three components from the device array
+// sdata->atom.f.dev_data (starting at element `first`, components spaced
+// sdata->atom.nmax elements apart) into the host buffer buf_send, where the
+// three runs are stored back to back.  Returns the number of values written (3*n).
+int Cuda_CommCuda_PackReverse(cuda_shared_data* sdata,int n,int first,void* buf_send)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
+
+  const int size = n*3*sizeof(F_FLOAT);
+  if(sdata->buffer_new or (size>sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+  F_FLOAT* host_dst = (F_FLOAT*)buf_send;
+  F_FLOAT* dev_src = (F_FLOAT*)sdata->atom.f.dev_data;
+  dev_src += first;
+
+  for(int comp=0; comp<3; comp++)
+  {
+    // step to the next component run on both sides before every copy but the first
+    if(comp>0)
+    {
+      host_dst += n;
+      dev_src += sdata->atom.nmax;
+    }
+    cudaMemcpy(host_dst, dev_src, n*sizeof(F_FLOAT), cudaMemcpyDeviceToHost);
+  }
+
+  return n*3;
+}
+
+
+// Uploads n*3 F_FLOAT values from the host buffer buf_recv into sdata->buffer and
+// launches Cuda_CommCuda_UnpackReverse_Kernel on them using the device send list
+// for swap `iswap`.  Does no upload and no launch when sdata->atom.nlocal is not positive.
+void Cuda_CommCuda_UnpackReverse(cuda_shared_data* sdata,int n,int iswap,void* buf_recv)
+{
+  if(sdata->atom.update_nmax)
+    Cuda_CommCuda_UpdateNmax(sdata);
+  if(sdata->atom.update_nlocal)
+    cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
+
+  const int size = n*3*sizeof(F_FLOAT);
+  if(sdata->buffer_new or (size>sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+  int3 layout = getgrid(n);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x, layout.y, 1);
+
+  if(sdata->atom.nlocal<=0) return;
+
+  cudaMemcpy(sdata->buffer,buf_recv, size, cudaMemcpyHostToDevice);
+  Cuda_CommCuda_UnpackReverse_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap);
+  cudaThreadSynchronize();
+  CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse: Kernel execution failed");
+}
+
+// Launches Cuda_CommCuda_UnpackReverse_Self_Kernel for swap `iswap`, passing the
+// device send list, n, the maximum list length and `first`.  No host<->device
+// buffer transfer is done here.  Nothing is launched when sdata->atom.nlocal is
+// not positive.
+void Cuda_CommCuda_UnpackReverse_Self(cuda_shared_data* sdata,int n,int iswap,int first)
+{
+	if(sdata->atom.update_nmax)
+		Cuda_CommCuda_UpdateNmax(sdata);
+	if(sdata->atom.update_nlocal)
+		cudaMemcpyToSymbol(MY_CONST(nlocal)  , & sdata->atom.nlocal        , sizeof(int)      );
+	int size=n*3*sizeof(X_FLOAT);
+	if(sdata->buffer_new or (size>sdata->buffersize))
+		Cuda_CommCuda_UpdateBuffer(sdata,n);
+
+	int3 layout=getgrid(n);
+	dim3 threads(layout.z, 1, 1);
+	dim3 grid(layout.x, layout.y, 1);
+
+	if(sdata->atom.nlocal>0)
+	{
+	  Cuda_CommCuda_UnpackReverse_Self_Kernel<<<grid, threads,0>>>((int*) sdata->comm.sendlist.dev_data,n,sdata->comm.maxlistlength,iswap,first);
+	  cudaThreadSynchronize();
+	  // report failures under this function's own name (was "..._PackReverse_Self")
+	  CUT_CHECK_ERROR("Cuda_CommCuda_UnpackReverse_Self: Kernel execution failed");
+	}
+}
+
+
+// Builds the device-side send list for swap `iswap` and returns the int that the
+// kernel leaves in the first element of sdata->buffer (zeroed before the launch).
+// style==1 launches the "Single" kernel with slablo/slabhi, any other value the
+// "Multi" kernel with multilo/multihi.  The kernel wall-clock time is added to
+// sdata->cuda_timings.comm_border_kernel_buildlist.
+int Cuda_CommCuda_BuildSendlist(cuda_shared_data* sdata,int bordergroup,int ineed,int style,int atom_nfirst,int nfirst,int nlast,int dim,int iswap)
+{
+  MYDBG(printf(" # CUDA: CommCuda_BuildSendlist\n");)
+  timespec t_launch, t_finish;
+
+  Cuda_CommCuda_UpdateNmax(sdata);
+  cudaMemcpyToSymbol(MY_CONST(nlocal), & sdata->atom.nlocal, sizeof(int));
+  if(sdata->buffer_new or (80>sdata->buffersize))
+    Cuda_CommCuda_UpdateBuffer(sdata,10);
+
+  // number of candidates handed to getgrid()
+  int n;
+  if(bordergroup && ineed < 2)
+  {
+    n = atom_nfirst;
+    const int beyond_local = nlast-sdata->atom.nlocal+1;
+    if(beyond_local>n) n = beyond_local;
+  }
+  else
+    n = nlast-nfirst+1;
+
+  int3 layout = getgrid(n,0,512,true);
+  dim3 threads(layout.z, 1, 1);
+  dim3 grid(layout.x+1, layout.y, 1);
+
+  // reset the counter the kernels accumulate into
+  cudaMemset((int*) (sdata->buffer),0,sizeof(int));
+
+  clock_gettime(CLOCK_REALTIME,&t_launch);
+  if(style==1)
+  {
+    Cuda_CommCuda_BuildSendlist_Single<<<grid, threads,(threads.x+1)*sizeof(int)>>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.slablo.dev_data,(X_FLOAT*) sdata->comm.slabhi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength);
+  }
+  else
+  {
+    Cuda_CommCuda_BuildSendlist_Multi<<<grid, threads,(threads.x+1)*sizeof(int)>>>(bordergroup, ineed, atom_nfirst, nfirst, nlast, dim, iswap,(X_FLOAT*) sdata->comm.multilo.dev_data,(X_FLOAT*) sdata->comm.multihi.dev_data,(int*) sdata->comm.sendlist.dev_data,sdata->comm.maxlistlength);
+  }
+  cudaThreadSynchronize();
+  clock_gettime(CLOCK_REALTIME,&t_finish);
+  sdata->cuda_timings.comm_border_kernel_buildlist+=
+      t_finish.tv_sec-t_launch.tv_sec+1.0*(t_finish.tv_nsec-t_launch.tv_nsec)/1000000000;
+
+  CUT_CHECK_ERROR("Cuda_CommCuda_BuildSendlist: Kernel execution failed");
+
+  int nsend;
+  cudaMemcpy(&nsend, sdata->buffer, sizeof(int), cudaMemcpyDeviceToHost);
+  return nsend;
+}
+
--- a/src/USER-CUDA/compute_pe_cuda.cpp
+++ b/src/USER-CUDA/compute_pe_cuda.cpp
@ -0,0 +1,61 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstring>
+#include "compute_pe_cuda.h"
+#include "atom.h"
+#include "update.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "dihedral.h"
+#include "improper.h"
+#include "kspace.h"
+#include "modify.h"
+#include "domain.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+// Forwards all arguments to ComputePE and marks this compute as cudable.
+ComputePECuda::ComputePECuda(LAMMPS *lmp, int narg, char **arg)
+  : ComputePE(lmp, narg, arg)
+{
+  this->cudable = 1;
+}
--- a/src/USER-CUDA/compute_pe_cuda.h
+++ b/src/USER-CUDA/compute_pe_cuda.h
@ -0,0 +1,59 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(pe/cuda,ComputePECuda)
+
+#else
+
+#ifndef LMP_COMPUTE_PE_CUDA_H
+#define LMP_COMPUTE_PE_CUDA_H
+
+#include "compute_pe.h"
+
+namespace LAMMPS_NS {
+
+// Compute style registered as "pe/cuda": a ComputePE subclass that declares
+// only its own constructor and an empty destructor.
+class ComputePECuda : public ComputePE {
+ public:
+  // arguments are (LAMMPS instance, narg, arg)
+  ComputePECuda(class LAMMPS *, int, char **);
+  ~ComputePECuda() {}
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/compute_pressure_cuda.cpp
+++ b/src/USER-CUDA/compute_pressure_cuda.cpp
@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstring>
+#include <cstdlib>
+#include "compute_pressure_cuda.h"
+#include "atom.h"
+#include "update.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "dihedral.h"
+#include "improper.h"
+#include "kspace.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+// Invocation-kind constants (values 0..4); not referenced by the code visible in this file.
+// NOTE(review): "DUMMMY3" is spelled with three M's -- confirm it matches other copies of this enum.
+enum{DUMMY0,INVOKED_SCALAR,INVOKED_VECTOR,DUMMMY3,INVOKED_PERATOM};
+
+/* ---------------------------------------------------------------------- */
+
+// Constructs a pressure/cuda compute on top of ComputePressure.
+// Requires an active Cuda instance on lmp (aborts via error->all otherwise).
+// The compute named by arg[3] is looked up; if it is not cudable a warning is
+// issued and this compute is marked non-cudable as well.
+ComputePressureCuda::ComputePressureCuda(LAMMPS *lmp, int narg, char **arg) :
+  ComputePressure(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  cudable = 1;
+
+  // look up the temperature compute used by the pressure computation;
+  // arg[3] is passed directly, so no temporary copy has to be allocated and freed
+  // (the previous copy was allocated with new[] but released with scalar delete)
+  int icompute = modify->find_compute(arg[3]);
+
+  // never index modify->compute with a failed lookup
+  if (icompute < 0)
+    error->all("Could not find compute pressure/cuda temperature ID");
+
+  if (modify->compute[icompute]->cudable == 0)
+  {
+    error->warning("Compute pressure/cuda temperature ID is not cudable! Try a temp/cuda style.");
+    cudable = 0;
+  }
+
+}
+
+// Downloads all data from the device first when the temperature compute is not
+// cudable and CUDA setup has finished, then delegates to
+// ComputePressure::compute_scalar() and returns its result.
+double ComputePressureCuda::compute_scalar()
+{
+  if(not temperature->cudable && cuda->finished_setup) cuda->downloadAll();
+  // the base-class result must be propagated; falling off the end of a
+  // non-void function is undefined behaviour
+  return ComputePressure::compute_scalar();
+}
+
+// Downloads all data from the device first when the temperature compute is not
+// cudable and CUDA setup has finished, then delegates to ComputePressure::compute_vector().
+void ComputePressureCuda::compute_vector()
+{
+  if(!temperature->cudable && cuda->finished_setup)
+  {
+    cuda->downloadAll();
+  }
+  ComputePressure::compute_vector();
+}
--- a/src/USER-CUDA/compute_pressure_cuda.h
+++ b/src/USER-CUDA/compute_pressure_cuda.h
@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(pressure/cuda,ComputePressureCuda)
+
+#else
+
+#ifndef LMP_COMPUTE_PRESSURE_CUDA_H
+#define LMP_COMPUTE_PRESSURE_CUDA_H
+
+#include "compute_pressure.h"
+
+namespace LAMMPS_NS {
+
+// Compute style registered as "pressure/cuda": a ComputePressure subclass that
+// declares its own compute_scalar()/compute_vector() and keeps a pointer to the
+// Cuda object.
+class ComputePressureCuda : public ComputePressure {
+ public:
+  // arguments are (LAMMPS instance, narg, arg)
+  ComputePressureCuda(class LAMMPS *, int, char **);
+  ~ComputePressureCuda() {}
+  double compute_scalar();
+  void compute_vector();
+
+  private:
+  class Cuda *cuda;   // not owned: the destructor is empty
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/compute_temp_cuda.cpp
+++ b/src/USER-CUDA/compute_temp_cuda.cpp
@ -0,0 +1,212 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "compute_temp_cuda.h"
+#include "compute_temp_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "force.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "group.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+// Constructs a temp/cuda compute.  Requires an active Cuda instance on lmp and
+// exactly three arguments; either violation aborts via error->all.
+// Allocates the 6-element result vector; the device mirrors are created lazily
+// by compute_scalar()/compute_vector().
+ComputeTempCuda::ComputeTempCuda(LAMMPS *lmp, int narg, char **arg)
+  : Compute(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+  {
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  }
+
+  if (narg != 3)
+  {
+    error->all("Illegal compute temp/cuda command");
+  }
+
+  // this compute provides both a scalar and a 6-component vector
+  vector_flag = 1;
+  scalar_flag = vector_flag;
+  size_vector = 6;
+  extscalar = 0;
+  extvector = 1;
+  tempflag = 1;
+
+  vector = new double[6];
+
+  // device mirrors do not exist yet
+  cu_t_vector = 0;
+  cu_t_scalar = 0;
+  cudable = true;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Releases the result vector and both device mirrors (either may still be null).
+ComputeTempCuda::~ComputeTempCuda()
+{
+  delete [] this->vector;
+  delete this->cu_t_vector;
+  delete this->cu_t_scalar;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Sums dof(igroup) over all fixes into fix_dof, then recomputes dof and tfactor.
+void ComputeTempCuda::init()
+{
+  fix_dof = 0;
+  int ifix = 0;
+  while (ifix < modify->nfix) {
+    fix_dof += modify->fix[ifix]->dof(igroup);
+    ++ifix;
+  }
+  dof_compute();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Recomputes dof = dimension * (atoms in group) - (extra_dof + fix_dof) and the
+// matching scale factor tfactor = mvv2e / (dof * boltz); tfactor is 0 when dof
+// is not positive.
+void ComputeTempCuda::dof_compute()
+{
+  double natoms = group->count(igroup);
+  dof = domain->dimension * natoms;
+  dof -= extra_dof + fix_dof;
+  tfactor = (dof > 0.0) ? force->mvv2e / (dof * force->boltz) : 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Computes the scalar result: sum over atoms in the group of m*|v|^2, reduced
+// over all MPI ranks and multiplied by tfactor.
+// When cuda->begin_setup is set the local sum is produced by
+// Cuda_ComputeTempCuda_Scalar and downloaded; otherwise it is accumulated on the host.
+// A result above 1e15 prints the offending atoms and aborts via error->all.
+double ComputeTempCuda::compute_scalar()
+{
+  invoked_scalar = update->ntimestep;
+
+  if(cuda->begin_setup)
+  {
+    // create the device mirrors on first use
+    if(!cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_FLOAT, x> (t_vector,6);
+    if(!cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_FLOAT, x> (&t_scalar,1);
+    Cuda_ComputeTempCuda_Scalar(&cuda->shared_data,groupbit,(ENERGY_FLOAT*) cu_t_scalar->dev_data());
+    cu_t_scalar->download();
+  }
+  else
+  {
+    double **vel = atom->v;
+    double *mass = atom->mass;
+    double *rmass = atom->rmass;
+    int *type = atom->type;
+    int *mask = atom->mask;
+    const int nlocal = atom->nlocal;
+
+    double ke_sum = 0.0;
+    for (int i = 0; i < nlocal; i++) {
+      if (!(mask[i] & groupbit)) continue;
+      // per-atom mass when available, per-type mass otherwise
+      if (rmass)
+        ke_sum += (vel[i][0]*vel[i][0] + vel[i][1]*vel[i][1] + vel[i][2]*vel[i][2]) * rmass[i];
+      else
+        ke_sum += (vel[i][0]*vel[i][0] + vel[i][1]*vel[i][1] + vel[i][2]*vel[i][2]) *
+          mass[type[i]];
+    }
+    t_scalar = ke_sum;
+  }
+
+  MPI_Allreduce(&t_scalar,&scalar,1,MPI_DOUBLE,MPI_SUM,world);
+  if (dynamic) dof_compute();
+  scalar *= tfactor;
+
+  if(scalar <= 1e15) return scalar;
+
+  // diagnostics: list every local atom whose squared speed exceeds 1e5, then abort
+  // NOTE(review): the downloads below run regardless of cuda->begin_setup -- confirm
+  // that cu_v/cu_x/cu_type are valid on the host-only path.
+  cuda->cu_v->download();
+  cuda->cu_x->download();
+  cuda->cu_type->download();
+  double **vel = atom->v;
+  double **pos = atom->x;
+  printf("Out of v-range atoms:  \n");
+  for(int i=0;i<atom->nlocal;i++)
+  {
+    if((vel[i][0]*vel[i][0] + vel[i][1]*vel[i][1] + vel[i][2]*vel[i][2])>1e5)
+      printf("%i %i // %lf %lf %lf // %lf %lf %lf\n",atom->tag[i],atom->type[i],pos[i][0], pos[i][1], pos[i][2],vel[i][0], vel[i][1], vel[i][2]);
+  }
+  error->all("Temperature out of range. Simulations will be abortet.\n");
+  return scalar;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Computes the 6-component result (xx, yy, zz, xy, xz, yz products of velocity
+// components weighted by mass), reduced over all MPI ranks and scaled by force->mvv2e.
+// When cuda->begin_setup is set the local sums come from Cuda_ComputeTempCuda_Vector;
+// otherwise they are accumulated on the host.
+void ComputeTempCuda::compute_vector()
+{
+  invoked_vector = update->ntimestep;
+
+  if(cuda->begin_setup)
+  {
+    // create the device mirrors on first use
+    if(!cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_FLOAT, x> (t_vector,6);
+    if(!cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_FLOAT, x> (&t_scalar,1);
+
+    Cuda_ComputeTempCuda_Vector(&cuda->shared_data,groupbit,(ENERGY_FLOAT*) cu_t_vector->dev_data());
+    cu_t_vector->download();
+  }
+  else
+  {
+    double **vel = atom->v;
+    double *mass = atom->mass;
+    double *rmass = atom->rmass;
+    int *type = atom->type;
+    int *mask = atom->mask;
+    const int nlocal = atom->nlocal;
+
+    double local_sum[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+
+    for (int iatom = 0; iatom < nlocal; iatom++) {
+      if (!(mask[iatom] & groupbit)) continue;
+
+      const double massone = rmass ? rmass[iatom] : mass[type[iatom]];
+      const double *vi = vel[iatom];
+      local_sum[0] += massone * vi[0]*vi[0];
+      local_sum[1] += massone * vi[1]*vi[1];
+      local_sum[2] += massone * vi[2]*vi[2];
+      local_sum[3] += massone * vi[0]*vi[1];
+      local_sum[4] += massone * vi[0]*vi[2];
+      local_sum[5] += massone * vi[1]*vi[2];
+    }
+
+    for (int k = 0; k < 6; k++) t_vector[k] = local_sum[k];
+  }
+
+  MPI_Allreduce(t_vector,vector,6,MPI_DOUBLE,MPI_SUM,world);
+  for (int k = 0; k < 6; k++) vector[k] *= force->mvv2e;
+}
--- a/src/USER-CUDA/compute_temp_cuda.h
+++ b/src/USER-CUDA/compute_temp_cuda.h
@ -0,0 +1,75 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(temp/cuda,ComputeTempCuda)
+
+#else
+
+#ifndef LMP_COMPUTE_TEMP_CUDA_H
+#define LMP_COMPUTE_TEMP_CUDA_H
+
+#include "compute.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// Compute style registered as "temp/cuda": a Compute subclass exposing a scalar
+// and a 6-component vector result, with per-rank partial sums mirrored in
+// cCudaData objects.
+class ComputeTempCuda : public Compute {
+ public:
+  // arguments are (LAMMPS instance, narg, arg)
+  ComputeTempCuda(class LAMMPS *, int, char **);
+  ~ComputeTempCuda();
+  void init();
+  double compute_scalar();
+  void compute_vector();
+
+ private:
+  class Cuda *cuda;
+  int fix_dof;
+  double tfactor;
+
+  void dof_compute();
+  // per-rank partial sums, used as input to the MPI reduction
+  double t_vector[6];
+  double t_scalar;
+  // cCudaData wrappers around t_scalar / t_vector; deleted by the destructor
+  cCudaData<double     , ENERGY_FLOAT   		, x>* cu_t_scalar;	
+  cCudaData<double     , ENERGY_FLOAT   		, x>* cu_t_vector;	
+  
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/compute_temp_partial_cuda.cpp
+++ b/src/USER-CUDA/compute_temp_partial_cuda.cpp
@ -0,0 +1,357 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstdlib>
+#include <cstdio>
+#include <cstring>
+#include "compute_temp_partial_cuda.h"
+#include "compute_temp_partial_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "force.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "group.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+// Constructs a temp/partial/cuda compute.  Requires an active Cuda instance on
+// lmp and exactly six arguments; arg[3..5] are parsed with atoi into
+// xflag/yflag/zflag.  A non-zero zflag in a 2d domain aborts via error->all.
+// Bias storage and device mirrors start out empty.
+ComputeTempPartialCuda::ComputeTempPartialCuda(LAMMPS *lmp, int narg, char **arg)
+  : Compute(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+  {
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  }
+
+  if (narg != 6)
+  {
+    error->all("Illegal compute temp/partial command");
+  }
+
+  // this compute provides both a scalar and a 6-component vector
+  vector_flag = 1;
+  scalar_flag = vector_flag;
+  size_vector = 6;
+  extscalar = 0;
+  extvector = 1;
+  tempflag = 1;
+  tempbias = 1;
+
+  // per-component selection flags
+  xflag = atoi(arg[3]);
+  yflag = atoi(arg[4]);
+  zflag = atoi(arg[5]);
+  if (zflag && domain->dimension == 2)
+  {
+    error->all("Compute temp/partial cannot use vz for 2d systemx");
+  }
+
+  // no bias storage yet
+  maxbias = 0;
+  vbiasall = NULL;
+
+  vector = new double[6];
+
+  // device mirrors do not exist yet
+  cu_t_vector = 0;
+  cu_t_scalar = 0;
+  cu_vbiasall = NULL;
+  cudable = true;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Releases the bias array, the result vector and all three device mirrors
+// (any of the mirrors may still be null).
+ComputeTempPartialCuda::~ComputeTempPartialCuda()
+{
+  memory->destroy(this->vbiasall);
+  delete [] this->vector;
+  delete this->cu_t_vector;
+  delete this->cu_t_scalar;
+  delete this->cu_vbiasall;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Sums dof(igroup) over all fixes into fix_dof, then recomputes dof and tfactor.
+void ComputeTempPartialCuda::init()
+{
+  fix_dof = 0;
+  int ifix = 0;
+  while (ifix < modify->nfix) {
+    fix_dof += modify->fix[ifix]->dof(igroup);
+    ++ifix;
+  }
+  dof_compute();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Recomputes dof from the number of selected components (xflag+yflag+zflag) per
+// atom, minus the fix-removed dof scaled by nper/dimension and extra_dof, and
+// updates tfactor = mvv2e / (dof * boltz); tfactor is 0 when dof is not positive.
+void ComputeTempPartialCuda::dof_compute()
+{
+  double natoms = group->count(igroup);
+  int nper = xflag+yflag+zflag;
+  dof = nper * natoms;
+  dof -= (1.0*nper/domain->dimension)*fix_dof + extra_dof;
+  tfactor = (dof > 0) ? force->mvv2e / (dof * force->boltz) : 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Returns dimension minus the number of selected components; the atom index
+// argument is not used.
+int ComputeTempPartialCuda::dof_remove(int i)
+{
+  const int selected = xflag+yflag+zflag;
+  const int removed = domain->dimension - selected;
+  return removed;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Computes the scalar result: sum over atoms in the group of
+// m*(xflag*vx^2 + yflag*vy^2 + zflag*vz^2), reduced over all MPI ranks and
+// multiplied by tfactor.
+// When cuda->begin_setup is set the local sum is produced by
+// Cuda_ComputeTempPartialCuda_Scalar and downloaded; otherwise it is accumulated
+// on the host.  A result above 1e15 prints the offending atoms and aborts via error->all.
+double ComputeTempPartialCuda::compute_scalar()
+{
+  invoked_scalar = update->ntimestep;
+
+  if(cuda->begin_setup)
+  {
+    // create the device mirrors on first use
+    if(!cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_FLOAT, x> (t_vector,6);
+    if(!cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_FLOAT, x> (&t_scalar,1);
+    Cuda_ComputeTempPartialCuda_Scalar(&cuda->shared_data,groupbit,(ENERGY_FLOAT*) cu_t_scalar->dev_data(),xflag,yflag,zflag);
+    cu_t_scalar->download();
+  }
+  else
+  {
+    double **vel = atom->v;
+    double *mass = atom->mass;
+    double *rmass = atom->rmass;
+    int *type = atom->type;
+    int *mask = atom->mask;
+    const int nlocal = atom->nlocal;
+
+    double ke_sum = 0.0;
+    for (int i = 0; i < nlocal; i++) {
+      if (!(mask[i] & groupbit)) continue;
+      // per-atom mass when available, per-type mass otherwise
+      if (rmass)
+        ke_sum += (xflag*vel[i][0]*vel[i][0] + yflag*vel[i][1]*vel[i][1] + zflag*vel[i][2]*vel[i][2]) * rmass[i];
+      else
+        ke_sum += (xflag*vel[i][0]*vel[i][0] + yflag*vel[i][1]*vel[i][1] + zflag*vel[i][2]*vel[i][2]) *
+          mass[type[i]];
+    }
+    t_scalar = ke_sum;
+  }
+
+  MPI_Allreduce(&t_scalar,&scalar,1,MPI_DOUBLE,MPI_SUM,world);
+  if (dynamic) dof_compute();
+  scalar *= tfactor;
+
+  if(scalar <= 1e15) return scalar;
+
+  // diagnostics: list every local atom whose full squared speed exceeds 1e5, then abort
+  // NOTE(review): the downloads below run regardless of cuda->begin_setup -- confirm
+  // that cu_v/cu_x/cu_type are valid on the host-only path.
+  cuda->cu_v->download();
+  cuda->cu_x->download();
+  cuda->cu_type->download();
+  double **vel = atom->v;
+  double **pos = atom->x;
+  printf("Out of v-range atoms:  \n");
+  for(int i=0;i<atom->nlocal;i++)
+  {
+    if((vel[i][0]*vel[i][0] + vel[i][1]*vel[i][1] + vel[i][2]*vel[i][2])>1e5)
+      printf("%i %i // %lf %lf %lf // %lf %lf %lf\n",atom->tag[i],atom->type[i],pos[i][0], pos[i][1], pos[i][2],vel[i][0], vel[i][1], vel[i][2]);
+  }
+  error->all("Temperature out of range. Simulations will be abortet.\n");
+  return scalar;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Computes the 6-component result (xx, yy, zz, xy, xz, yz products of velocity
+// components weighted by mass and by the matching x/y/z flags), reduced over all
+// MPI ranks and scaled by force->mvv2e.
+// When cuda->begin_setup is set the local sums come from
+// Cuda_ComputeTempPartialCuda_Vector; otherwise they are accumulated on the host.
+void ComputeTempPartialCuda::compute_vector()
+{
+  invoked_vector = update->ntimestep;
+
+  if(cuda->begin_setup)
+  {
+    // create the device mirrors on first use
+    if(!cu_t_vector) cu_t_vector = new cCudaData<double, ENERGY_FLOAT, x> (t_vector,6);
+    if(!cu_t_scalar) cu_t_scalar = new cCudaData<double, ENERGY_FLOAT, x> (&t_scalar,1);
+
+    Cuda_ComputeTempPartialCuda_Vector(&cuda->shared_data,groupbit,(ENERGY_FLOAT*) cu_t_vector->dev_data(),xflag,yflag,zflag);
+    cu_t_vector->download();
+  }
+  else
+  {
+    double **vel = atom->v;
+    double *mass = atom->mass;
+    double *rmass = atom->rmass;
+    int *type = atom->type;
+    int *mask = atom->mask;
+    const int nlocal = atom->nlocal;
+
+    double local_sum[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+
+    for (int iatom = 0; iatom < nlocal; iatom++) {
+      if (!(mask[iatom] & groupbit)) continue;
+
+      const double massone = rmass ? rmass[iatom] : mass[type[iatom]];
+      const double *vi = vel[iatom];
+      local_sum[0] += massone * xflag*vi[0]*vi[0];
+      local_sum[1] += massone * yflag*vi[1]*vi[1];
+      local_sum[2] += massone * zflag*vi[2]*vi[2];
+      local_sum[3] += massone * xflag*yflag*vi[0]*vi[1];
+      local_sum[4] += massone * xflag*zflag*vi[0]*vi[2];
+      local_sum[5] += massone * yflag*zflag*vi[1]*vi[2];
+    }
+
+    for (int k = 0; k < 6; k++) t_vector[k] = local_sum[k];
+  }
+
+  MPI_Allreduce(t_vector,vector,6,MPI_DOUBLE,MPI_SUM,world);
+  for (int k = 0; k < 6; k++) vector[k] *= force->mvv2e;
+}
+
+/* ----------------------------------------------------------------------
+   remove velocity bias from atom I to leave thermal velocity
+------------------------------------------------------------------------- */
+
+// For every component whose flag is zero: saves v[d] into vbias[d] and zeroes v[d].
+// Components with a non-zero flag are left untouched.  The atom index is not used.
+void ComputeTempPartialCuda::remove_bias(int i, double *v)
+{
+  const int keep[3] = {xflag, yflag, zflag};
+  for (int d = 0; d < 3; d++) {
+    if (keep[d]) continue;
+    vbias[d] = v[d];
+    v[d] = 0.0;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   remove velocity bias from all atoms to leave thermal velocity
+------------------------------------------------------------------------- */
+
+// Removes the unselected velocity components from all local atoms in the group,
+// saving them in vbiasall.  The bias array and its device mirror are re-created
+// with atom->nmax rows whenever nlocal exceeds maxbias.
+// When cuda->begin_setup is set the work is done by
+// Cuda_ComputeTempPartialCuda_RemoveBiasAll; otherwise on the host.
+void ComputeTempPartialCuda::remove_bias_all()
+{
+  double **vel = atom->v;
+  int *mask = atom->mask;
+  const int nlocal = atom->nlocal;
+
+  if (nlocal > maxbias) {
+    // grow host storage, then rebuild the device mirror around the new array
+    memory->destroy(vbiasall);
+    maxbias = atom->nmax;
+    memory->create(vbiasall,maxbias,3,"temp/partial:vbiasall");
+    delete cu_vbiasall;
+    cu_vbiasall = new cCudaData<double, V_FLOAT, yx> ((double*)vbiasall, atom->nmax, 3);
+  }
+
+  if(cuda->begin_setup)
+  {
+    Cuda_ComputeTempPartialCuda_RemoveBiasAll(&cuda->shared_data,groupbit,xflag,yflag,zflag,cu_vbiasall->dev_data());
+    return;
+  }
+
+  // host path: one pass over the atoms per unselected component
+  const int keep[3] = {xflag, yflag, zflag};
+  for (int d = 0; d < 3; d++) {
+    if (keep[d]) continue;
+    for (int i = 0; i < nlocal; i++) {
+      if (!(mask[i] & groupbit)) continue;
+      vbiasall[i][d] = vel[i][d];
+      vel[i][d] = 0.0;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   add back in velocity bias to atom I removed by remove_bias()
+   assume remove_bias() was previously called
+------------------------------------------------------------------------- */
+
+// Adds vbias[d] back onto v[d] for every component whose flag is zero.
+// The atom index is not used.
+void ComputeTempPartialCuda::restore_bias(int i, double *v)
+{
+  const int keep[3] = {xflag, yflag, zflag};
+  for (int d = 0; d < 3; d++) {
+    if (keep[d]) continue;
+    v[d] += vbias[d];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   add back in velocity bias to all atoms removed by remove_bias_all()
+   assume remove_bias_all() was previously called
+------------------------------------------------------------------------- */
+
+// Adds the saved bias in vbiasall back onto the unselected velocity components
+// of all local atoms in the group.
+// When cuda->begin_setup is set the work is done by
+// Cuda_ComputeTempPartialCuda_RestoreBiasAll (which dereferences cu_vbiasall);
+// otherwise on the host.
+void ComputeTempPartialCuda::restore_bias_all()
+{
+  if(cuda->begin_setup)
+  {
+    Cuda_ComputeTempPartialCuda_RestoreBiasAll(&cuda->shared_data,groupbit,xflag,yflag,zflag,cu_vbiasall->dev_data());
+    return;
+  }
+
+  double **vel = atom->v;
+  int *mask = atom->mask;
+  const int nlocal = atom->nlocal;
+
+  // host path: one pass over the atoms per unselected component
+  const int keep[3] = {xflag, yflag, zflag};
+  for (int d = 0; d < 3; d++) {
+    if (keep[d]) continue;
+    for (int i = 0; i < nlocal; i++) {
+      if (mask[i] & groupbit)
+        vel[i][d] += vbiasall[i][d];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Reports maxbias * sizeof(double) bytes.
+// NOTE(review): vbiasall is created with maxbias x 3 doubles in remove_bias_all(),
+// so this figure may under-report -- confirm the intended accounting.
+double ComputeTempPartialCuda::memory_usage()
+{
+  return maxbias * sizeof(double);
+}
--- a/src/USER-CUDA/compute_temp_partial_cuda.h
+++ b/src/USER-CUDA/compute_temp_partial_cuda.h
@ -0,0 +1,83 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+
+ComputeStyle(temp/partial/cuda,ComputeTempPartialCuda)
+
+#else
+
+#ifndef LMP_COMPUTE_TEMP_PARTIAL_CUDA_H
+#define LMP_COMPUTE_TEMP_PARTIAL_CUDA_H
+
+#include "compute.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class ComputeTempPartialCuda : public Compute {
+ public:
+  ComputeTempPartialCuda(class LAMMPS *, int, char **);
+  ~ComputeTempPartialCuda();
+
+  // setup and evaluation
+  void init();
+  double compute_scalar();
+  void compute_vector();
+
+  // velocity bias: remove/restore for one atom or for all atoms of the group
+  int dof_remove(int);
+  void remove_bias(int, double *);
+  void remove_bias_all();
+  void restore_bias(int, double *);
+  void restore_bias_all();
+
+  double memory_usage();
+
+ private:
+  class Cuda *cuda;
+
+  // per-dimension flags; a zero flag marks that velocity component as bias
+  int xflag;
+  int yflag;
+  int zflag;
+
+  int fix_dof;
+  double tfactor;
+
+  void dof_compute();
+
+  // host values and their cCudaData wrappers
+  double t_vector[6];
+  double t_scalar;
+  cCudaData<double, ENERGY_FLOAT, x> *cu_t_scalar;
+  cCudaData<double, ENERGY_FLOAT, x> *cu_t_vector;
+
+  // wrapper whose dev_data() is passed to the CUDA bias routines
+  cCudaData<double, V_FLOAT, yx> *cu_vbiasall;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/cuda.cpp
+++ b/src/USER-CUDA/cuda.cpp
@ -230,7 +230,7 @@ void Cuda::accelerator(int narg, char** arg)
 	  {
 	  	if(++i==narg) 
 	  	  error->all("Invalid Options for 'accelerator' command. Expecting a string after 'suffix' option."); 
-	  	strcpy(lmp->asuffix,arg[i]);
+	  	strcpy(lmp->suffix,arg[i]);
 	  }
 	  if(strcmp(arg[i],"overlap_comm")==0) 
 	  {
--- a/src/USER-CUDA/cuda_neigh_list.cpp
+++ b/src/USER-CUDA/cuda_neigh_list.cpp
@ -29,12 +29,16 @@
 #include <algorithm>
 #include "cuda.h"
 #include "atom.h"
+#include "error.h"

 using namespace LAMMPS_NS;

 CudaNeighList::CudaNeighList(LAMMPS *lmp, class NeighList* neigh_list) : Pointers(lmp)
 {
        cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
 	MYDBG(printf("# CUDA: CudaNeighList::cudaNeighList() ... start\n");)
 	this->neigh_list = neigh_list;
 	neigh_list->cuda_list=this;
--- a/src/USER-CUDA/domain_cuda.cpp
+++ b/src/USER-CUDA/domain_cuda.cpp
@ -54,6 +54,8 @@ enum{NO_REMAP,X_REMAP,V_REMAP};                   // same as fix_deform.cpp
 DomainCuda::DomainCuda(LAMMPS *lmp) : Domain(lmp)
 {
  cuda = lmp->cuda;
+   if(cuda == NULL)  // lmp->cuda is not set: report it instead of keeping a NULL handle
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
 }

 /* ---------------------------------------------------------------------- */
--- a/src/USER-CUDA/fft3d_cuda.cpp
+++ b/src/USER-CUDA/fft3d_cuda.cpp
@ -0,0 +1,608 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Jim Shepherd (GA Tech) added SGI SCSL support
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include "fft3d_cuda.h"
+#include "fft3d_cuda_cu.h"
+#include "remap.h"
+#include <ctime>
+#include "cuda_wrapper_cu.h"
+
+#ifdef FFT_CUFFT
+#endif
+// The whole conditional is parenthesized so that the macros keep their
+// meaning inside larger expressions (e.g. 2*MAX(a,b) or 10+MIN(a,b)).
+// Arguments are still evaluated more than once; do not pass expressions
+// with side effects.
+#define MIN(A,B) (((A) < (B)) ? (A) : (B))
+#define MAX(A,B) (((A) > (B)) ? (A) : (B))
+
+/* ----------------------------------------------------------------------
+   Data layout for 3d FFTs:
+
+   data set of Nfast x Nmid x Nslow elements is owned by P procs
+   on input, each proc owns a subsection of the elements
+   on output, each proc will own a (possibly different) subsection
+   my subsection must not overlap with any other proc's subsection,
+     i.e. the union of all proc's input (or output) subsections must
+     exactly tile the global Nfast x Nmid x Nslow data set
+   when called from C, all subsection indices are 
+     C-style from 0 to N-1 where N = Nfast or Nmid or Nslow
+   when called from F77, all subsection indices are 
+     F77-style from 1 to N where N = Nfast or Nmid or Nslow
+   a proc can own 0 elements on input or output
+     by specifying hi index < lo index
+   on both input and output, data is stored contiguously on a processor
+     with a fast-varying, mid-varying, and slow-varying index
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Perform 3d FFT 
+
+   Arguments:
+   in           starting address of input data on this proc
+   out          starting address of where output data for this proc
+                  will be placed (can be same as in)
+   flag         -1 selects CUFFT_FORWARD, any other value selects CUFFT_INVERSE
+   plan         plan returned by previous call to fft_3d_create_plan_cuda
+------------------------------------------------------------------------- */
+
+void fft_3d_cuda(FFT_DATA *in, FFT_DATA *out, int flag, struct fft_plan_3d *plan)
+{
+#ifdef FFT_CUFFT
+  plan->iterate++;
+
+  const int nprocs = plan->nprocs;
+
+  // Only for "init" plans on more than one proc: optionally remap the input
+  // with pre_plan (result lands in out), upload it into plan->cudata2 and
+  // let initfftdata() fill plan->cudata from it.
+  // In every other case plan->cudata is used as it is.
+  if (plan->init && nprocs > 1) {
+    FFT_DATA *data = in;
+    if (plan->pre_plan) {
+      remap_3d((double *) in, (double *) out, (double *) plan->scratch,plan->pre_plan);
+      data = out;
+    }
+
+    // upload size depends on the width of FFT_FLOAT
+    unsigned int nbytes = 0;
+    if (sizeof(FFT_FLOAT) == sizeof(double)) nbytes = plan->cudatasize/2;
+    if (sizeof(FFT_FLOAT) == sizeof(float)) nbytes = plan->cudatasize;
+
+    if (nbytes) {
+      // a failed upload would otherwise silently transform stale device data
+      cudaError_t cpyerr = cudaMemcpy((void*) (plan->cudata2), (void*) data, nbytes,cudaMemcpyHostToDevice);
+      if (cpyerr != cudaSuccess) {printf("ErrorCUDA: %s\n",cudaGetErrorString(cpyerr));exit(EXIT_FAILURE);}
+    }
+
+    initfftdata((double*)plan->cudata2,(FFT_FLOAT*)plan->cudata,plan->nfast,plan->nmid,plan->nslow);
+  }
+
+  // out-of-place transform cudata -> cudata2;
+  // flag == -1 selects CUFFT_FORWARD, everything else CUFFT_INVERSE
+  const int direction = (flag == -1) ? CUFFT_FORWARD : CUFFT_INVERSE;
+  cufftResult retvalc = cufft(plan->plan_3d, plan->cudata, plan->cudata2,direction);
+  if(retvalc!=CUFFT_SUCCESS) {printf("ErrorCUFFT: %i\n",retvalc);exit(EXIT_FAILURE);}
+
+  FFTsyncthreads();
+#endif
+}
+#ifdef FFT_CUFFT
+/* ----------------------------------------------------------------------
+   Release a partially built plan and return NULL
+
+   every pointer member of the plan must be either NULL or valid;
+   the cuFFT handle is not touched because it is created last
+------------------------------------------------------------------------- */
+
+static struct fft_plan_3d *fft_3d_abort_plan_cuda(struct fft_plan_3d *plan)
+{
+  if (plan->pre_plan) remap_3d_destroy_plan(plan->pre_plan);
+  if (plan->mid1_plan) remap_3d_destroy_plan(plan->mid1_plan);
+  if (plan->mid2_plan) remap_3d_destroy_plan(plan->mid2_plan);
+  if (plan->post_plan) remap_3d_destroy_plan(plan->post_plan);
+  free(plan->copy);
+  free(plan->scratch);
+  free(plan);
+  return NULL;
+}
+#endif
+
+/* ----------------------------------------------------------------------
+   Create plan for performing a 3d FFT 
+
+   Arguments:
+   comm                 MPI communicator for the P procs which own the data
+   nfast,nmid,nslow     size of global 3d matrix
+   in_ilo,in_ihi        input bounds of data I own in fast index
+   in_jlo,in_jhi        input bounds of data I own in mid index
+   in_klo,in_khi        input bounds of data I own in slow index
+   out_ilo,out_ihi      output bounds of data I own in fast index
+   out_jlo,out_jhi      output bounds of data I own in mid index
+   out_klo,out_khi      output bounds of data I own in slow index
+   scaled               0 = no scaling of result, 1 = scaling
+   permute              permutation in storage order of indices on output
+                          0 = no permutation
+                          1 = permute once = mid->fast, slow->mid, fast->slow
+                          2 = permute twice = slow->fast, fast->mid, mid->slow
+   nbuf                 returns size of internal storage buffers used by FFT
+   ainit                stored as plan->init; when set, the pre-remap plan is
+                          created with 1 instead of 2 as its "members" argument
+
+   Returns the new plan, or NULL if an allocation, a remap plan or the
+   cuFFT plan could not be created (everything built so far is released).
+   Without FFT_CUFFT no plan can be built and NULL is returned.
+   plan->cudata and plan->cudata2 start out as NULL.
+------------------------------------------------------------------------- */
+
+struct fft_plan_3d *fft_3d_create_plan_cuda(
+       MPI_Comm comm, int nfast, int nmid, int nslow,
+       int in_ilo, int in_ihi, int in_jlo, int in_jhi,
+       int in_klo, int in_khi,
+       int out_ilo, int out_ihi, int out_jlo, int out_jhi,
+       int out_klo, int out_khi,
+       int scaled, int permute, int *nbuf,bool ainit)
+{
+#ifdef FFT_CUFFT
+  struct fft_plan_3d *plan;
+  int me,nprocs;
+  int flag,remapflag;
+  int first_ilo,first_ihi,first_jlo,first_jhi,first_klo,first_khi;
+  int second_ilo,second_ihi,second_jlo,second_jhi,second_klo,second_khi;
+  int third_ilo,third_ihi,third_jlo,third_jhi,third_klo,third_khi;
+  int out_size,first_size,second_size,third_size,copy_size,scratch_size;
+
+  // query MPI info
+
+  MPI_Comm_rank(comm,&me);
+  MPI_Comm_size(comm,&nprocs);
+
+  // in case of CUDA FFT every proc does the full FFT in order to avoid data
+  // transfers (the problem is otherwise heavily bandwidth limited), so the
+  // division of procs over the two non-local dimensions is fixed to 1 x 1
+
+  const int ip1 = 0;
+  const int ip2 = 0;
+  const int np1 = 1;
+  const int np2 = 1;
+
+  // allocate memory for plan data struct
+  // all owned pointers start as NULL so that a failure at any later step
+  // can release exactly what has been created so far
+
+  plan = (struct fft_plan_3d *) malloc(sizeof(struct fft_plan_3d));
+  if (plan == NULL) return NULL;
+  plan->init=ainit;
+  plan->pre_plan = NULL;
+  plan->mid1_plan = NULL;
+  plan->mid2_plan = NULL;
+  plan->post_plan = NULL;
+  plan->copy = NULL;
+  plan->scratch = NULL;
+  plan->cudata = NULL;
+  plan->cudata2 = NULL;
+
+  // remap from initial distribution to layout needed for 1st set of 1d FFTs
+  // not needed if all procs own entire fast axis initially
+  // always done when running on more than one proc
+  // first indices = distribution after 1st set of FFTs
+
+  if (in_ilo == 0 && in_ihi == nfast-1)
+    flag = 0;
+  else
+    flag = 1;
+
+  if (nprocs > 1) flag = 1;
+
+  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
+
+  if (remapflag == 0) {
+    first_ilo = in_ilo;
+    first_ihi = in_ihi;
+    first_jlo = in_jlo;
+    first_jhi = in_jhi;
+    first_klo = in_klo;
+    first_khi = in_khi;
+    plan->pre_plan = NULL;
+  }
+  else {
+    first_ilo = 0;
+    first_ihi = nfast - 1;
+    first_jlo = ip1*nmid/np1;
+    first_jhi = (ip1+1)*nmid/np1 - 1;
+    first_klo = ip2*nslow/np2;
+    first_khi = (ip2+1)*nslow/np2 - 1;
+    const int members = plan->init ? 1 : 2;
+    plan->pre_plan =
+      remap_3d_create_plan(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
+                           first_ilo,first_ihi,first_jlo,first_jhi,
+                           first_klo,first_khi,
+                           members,0,0,2);
+    if (plan->pre_plan == NULL) return fft_3d_abort_plan_cuda(plan);
+  }
+
+  // 1d FFTs along fast axis
+
+  plan->length1 = nfast;
+  plan->total1 = nfast * nmid * nslow;
+
+  // remap from 1st to 2nd FFT
+  // second indices = distribution after 2nd set of FFTs
+
+  second_ilo = ip1*nfast/np1;
+  second_ihi = (ip1+1)*nfast/np1 - 1;
+  second_jlo = 0;
+  second_jhi = nmid - 1;
+  second_klo = ip2*nslow/np2;
+  second_khi = (ip2+1)*nslow/np2 - 1;
+  plan->mid1_plan =
+      remap_3d_create_plan(comm,
+                           first_ilo,first_ihi,first_jlo,first_jhi,
+                           first_klo,first_khi,
+                           second_ilo,second_ihi,second_jlo,second_jhi,
+                           second_klo,second_khi,
+                           2,1,0,2);
+  if (plan->mid1_plan == NULL) return fft_3d_abort_plan_cuda(plan);
+
+  // 1d FFTs along mid axis
+
+  plan->length2 = nmid;
+  plan->total2 = nfast * nmid * nslow;
+
+  // remap from 2nd to 3rd FFT
+  // flag is forced to 1 here, so the reduction always requests the remap
+  // third indices = distribution after 3rd set of FFTs
+
+  flag = 1;
+
+  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
+
+  if (remapflag == 0) {
+    third_ilo = out_ilo;
+    third_ihi = out_ihi;
+    third_jlo = out_jlo;
+    third_jhi = out_jhi;
+    third_klo = out_klo;
+    third_khi = out_khi;
+  }
+  else {
+    third_ilo = ip1*nfast/np1;
+    third_ihi = (ip1+1)*nfast/np1 - 1;
+    third_jlo = ip2*nmid/np2;
+    third_jhi = (ip2+1)*nmid/np2 - 1;
+    third_klo = 0;
+    third_khi = nslow - 1;
+  }
+
+  plan->mid2_plan =
+    remap_3d_create_plan(comm,
+                         second_jlo,second_jhi,second_klo,second_khi,
+                         second_ilo,second_ihi,
+                         third_jlo,third_jhi,third_klo,third_khi,
+                         third_ilo,third_ihi,
+                         2,1,0,2);
+  if (plan->mid2_plan == NULL) return fft_3d_abort_plan_cuda(plan);
+
+  // 1d FFTs along slow axis
+
+  plan->length3 = nslow;
+  plan->total3 = nfast * nmid * nslow;
+
+  // remap from 3rd FFT to final distribution
+  // flag is forced to 1 here as well, so the post remap is always created
+
+  flag = 1;
+
+  MPI_Allreduce(&flag,&remapflag,1,MPI_INT,MPI_MAX,comm);
+
+  if (remapflag == 0)
+    plan->post_plan = NULL;
+  else {
+    plan->post_plan =
+      remap_3d_create_plan(comm,
+                           third_klo,third_khi,third_ilo,third_ihi,
+                           third_jlo,third_jhi,
+                           out_klo,out_khi,out_ilo,out_ihi,
+                           out_jlo,out_jhi,
+                           2,(permute+1)%3,0,2);
+    if (plan->post_plan == NULL) return fft_3d_abort_plan_cuda(plan);
+  }
+
+  // configure plan memory pointers and allocate work space
+  // out_size = amount of memory given to FFT by user
+  // first/second/third_size = amount of memory needed after pre,mid1,mid2 remaps
+  // copy_size = amount needed internally for extra copy of data
+  // scratch_size = amount needed internally for remap scratch space
+  // for each remap:
+  //   out space used for result if big enough, else require copy buffer
+  //   accumulate largest required remap scratch space
+
+  out_size = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) * (out_khi-out_klo+1);
+  first_size = (first_ihi-first_ilo+1) * (first_jhi-first_jlo+1) *
+    (first_khi-first_klo+1);
+  second_size = (second_ihi-second_ilo+1) * (second_jhi-second_jlo+1) *
+    (second_khi-second_klo+1);
+  third_size = (third_ihi-third_ilo+1) * (third_jhi-third_jlo+1) *
+    (third_khi-third_klo+1);
+
+  plan->ihi_out=out_ihi;
+  plan->ilo_out=out_ilo;
+  plan->jhi_out=out_jhi;
+  plan->jlo_out=out_jlo;
+  plan->khi_out=out_khi;
+  plan->klo_out=out_klo;
+
+  copy_size = 0;
+  scratch_size = 0;
+
+  if (plan->pre_plan) {
+    if (first_size <= out_size)
+      plan->pre_target = 0;
+    else {
+      plan->pre_target = 1;
+      copy_size = MAX(copy_size,first_size);
+    }
+    scratch_size = MAX(scratch_size,first_size);
+  }
+
+  if (plan->mid1_plan) {
+    if (second_size <= out_size)
+      plan->mid1_target = 0;
+    else {
+      plan->mid1_target = 1;
+      copy_size = MAX(copy_size,second_size);
+    }
+    scratch_size = MAX(scratch_size,second_size);
+  }
+
+  if (plan->mid2_plan) {
+    if (third_size <= out_size)
+      plan->mid2_target = 0;
+    else {
+      plan->mid2_target = 1;
+      copy_size = MAX(copy_size,third_size);
+    }
+    scratch_size = MAX(scratch_size,third_size);
+  }
+
+  if (plan->post_plan)
+    scratch_size = MAX(scratch_size,out_size);
+
+  *nbuf = copy_size + scratch_size;
+
+  if (copy_size) {
+    plan->copy = (FFT_DATA *) malloc(copy_size*sizeof(FFT_DATA));
+    if (plan->copy == NULL) return fft_3d_abort_plan_cuda(plan);
+  }
+  else plan->copy = NULL;
+
+  if (scratch_size) {
+    plan->scratch = (FFT_DATA *) malloc(scratch_size*sizeof(FFT_DATA));
+    if (plan->scratch == NULL) return fft_3d_abort_plan_cuda(plan);
+  }
+  else plan->scratch = NULL;
+
+  // device buffers are sized for the full global grid
+
+  const int ngrid = nfast*nmid*nslow;
+  plan->cudatasize = ngrid*sizeof(FFT_DATA);
+
+  plan->nfast=nfast;
+  plan->nmid=nmid;
+  plan->nslow=nslow;
+
+  // only the 3d cuFFT plan is created; plan_fast/plan_mid/plan_slow stay unset
+  // a plan without a usable handle is of no use to the caller, so fail here
+
+  cufftResult retvalc = cufftPlan3d(&(plan->plan_3d), nslow,nmid,nfast, CUFFT_PLAN);
+  if (retvalc != CUFFT_SUCCESS) {
+    printf("ErrorCUFFT3: %i\n",retvalc);
+    return fft_3d_abort_plan_cuda(plan);
+  }
+
+  plan->nprocs=nprocs;
+  plan->me=me;
+
+  // scaling normalization
+
+  if (scaled == 0)
+    plan->scaled = 0;
+  else {
+    plan->scaled = 1;
+    plan->norm = 1.0/(nfast*nmid*nslow);
+    plan->normnum = (out_ihi-out_ilo+1) * (out_jhi-out_jlo+1) *
+      (out_khi-out_klo+1);
+  }
+
+  plan->coretime=0;
+  plan->iterate=0;
+  plan->ffttime=0;
+  return plan;
+#else
+  // built without FFT_CUFFT: there is no CUDA FFT, report failure to the caller
+  return NULL;
+#endif
+}
+
+/* ----------------------------------------------------------------------
+   Destroy a 3d fft plan 
+------------------------------------------------------------------------- */
+
+void fft_3d_destroy_plan_cuda(struct fft_plan_3d *plan)
+{
+#ifdef FFT_CUFFT
+  // destroy whichever of the four remap plans exist
+  struct remap_plan_3d *remaps[4] =
+    {plan->pre_plan,plan->mid1_plan,plan->mid2_plan,plan->post_plan};
+  for (int k = 0; k < 4; k++) {
+    if (remaps[k]) remap_3d_destroy_plan(remaps[k]);
+  }
+
+  // free(NULL) does nothing, so the buffers need no explicit test
+  free(plan->copy);
+  free(plan->scratch);
+
+  // plan_3d is the only cuFFT handle destroyed here
+  cufftDestroy(plan->plan_3d);
+
+  free(plan);
+#endif
+}
+
+/* ----------------------------------------------------------------------
+   recursively divide n into small factors, return them in list
+------------------------------------------------------------------------- */
+
+void factor_cuda(int n, int *num, int *list)
+{
+  // n      number to factor
+  // num    incremented once for every factor that is stored
+  // list   receives the factors, starting at list[0]; the caller must
+  //        provide enough room, no bound is checked here
+  //
+  // Only 2,3,5,7,11,13 are split off (smallest first, repeated as often as
+  // they divide); whatever remains and is not 1 is stored as one last factor.
+  // n == 0 is divisible by everything and would never terminate, so it
+  // yields no factors at all.
+
+  static const int small_primes[] = {2,3,5,7,11,13};
+  const int nprimes = sizeof(small_primes)/sizeof(small_primes[0]);
+
+  if (n == 0) return;
+
+  while (n != 1) {
+    int divisor = 0;
+    for (int k = 0; k < nprimes; k++) {
+      if (n % small_primes[k] == 0) {
+        divisor = small_primes[k];
+        break;
+      }
+    }
+
+    if (divisor == 0) {
+      // remainder has none of the small primes as factor: keep it whole
+      *list = n;
+      (*num)++;
+      return;
+    }
+
+    *list++ = divisor;
+    (*num)++;
+    n /= divisor;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   divide n into 2 factors of as equal size as possible 
+------------------------------------------------------------------------- */
+
+void bifactor_cuda(int n, int *factor1, int *factor2)
+{
+  // start at floor(sqrt(n)) and walk downwards; the first divisor found
+  // becomes factor1 and its cofactor factor2 (so factor1 <= factor2)
+  // the outputs are left untouched when no candidate is tried (n < 1)
+  int candidate = static_cast<int> (sqrt((double) n));
+
+  while (candidate > 0) {
+    if (n % candidate == 0) {
+      *factor1 = candidate;
+      *factor2 = n/candidate;
+      return;
+    }
+    candidate--;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   perform just the 1d FFTs needed by a 3d FFT, no data movement
+   used for timing purposes
+
+   Arguments:
+   in           starting address of input data on this proc, all set to 0.0
+   nsize        size of in
+   flag         1 for forward FFT, -1 for inverse FFT
+   plan         plan returned by previous call to fft_3d_create_plan_cuda
+------------------------------------------------------------------------- */
+
+void fft_1d_only_cuda(FFT_DATA *data, int nsize, int flag, struct fft_plan_3d *plan)
+{
+#ifdef FFT_CUFFT
+  // Timing helper: upload data, run one in-place 3d transform on
+  // plan->cudata and download the result again.
+  //   data    host buffer holding nsize FFT_DATA elements
+  //   flag    -1 selects CUFFT_FORWARD, anything else CUFFT_INVERSE
+  //   plan    plan whose cudata buffer has already been set
+
+  // never move more than the caller's buffer holds
+  size_t nbytes = plan->cudatasize;
+  const size_t avail = (nsize > 0) ? (size_t) nsize * sizeof(FFT_DATA) : 0;
+  if (avail < nbytes) nbytes = avail;
+
+  // the copy targets the device buffer itself, not the address of the
+  // pointer member that refers to it
+  cudaError_t cpyerr = cudaMemcpy((void*) plan->cudata, (void*) data, nbytes,cudaMemcpyHostToDevice);
+  if (cpyerr != cudaSuccess) {printf("ErrorCUDA: %s\n",cudaGetErrorString(cpyerr));exit(EXIT_FAILURE);}
+
+  const int direction = (flag == -1) ? CUFFT_FORWARD : CUFFT_INVERSE;
+  cufftResult retvalc = cufft(plan->plan_3d, plan->cudata, plan->cudata,direction);
+  if (retvalc != CUFFT_SUCCESS) {printf("ErrorCUFFT: %i\n",retvalc);exit(EXIT_FAILURE);}
+
+  cpyerr = cudaMemcpy((void*) data, (void*) plan->cudata, nbytes,cudaMemcpyDeviceToHost);
+  if (cpyerr != cudaSuccess) {printf("ErrorCUDA: %s\n",cudaGetErrorString(cpyerr));exit(EXIT_FAILURE);}
+#endif
+}
--- a/src/USER-CUDA/fft3d_cuda.h
+++ b/src/USER-CUDA/fft3d_cuda.h
@ -0,0 +1,148 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+// User-settable FFT precision 
+
+// FFT_PRECISION_CU = 1 is single-precision complex (4-byte real, 4-byte imag) 
+// FFT_PRECISION_CU = 2 is double-precision complex (8-byte real, 8-byte imag) 
+#include "cuda_precision.h"
+//#define FFT_PRECISION 2
+
+// ------------------------------------------------------------------------- 
+
+// Data types for single-precision complex 
+
+#if FFT_PRECISION_CU == 1
+
+// FFT_DATA is needed with and without cuFFT, so it is defined unconditionally
+typedef struct {
+  float re;
+  float im;
+} FFT_DATA;
+
+#ifdef FFT_CUFFT
+#include "cuda_runtime.h"
+#include "cufft.h"
+typedef cufftComplex cufftData;
+typedef cufftReal cufftDataInit;
+#define cufft cufftExecC2C
+#define cufftinit cufftExecR2C
+#define CUFFT_PLAN CUFFT_C2C
+#define CUFFT_PLAN_INIT CUFFT_R2C
+#endif
+
+#endif
+
+// ------------------------------------------------------------------------- 
+
+// Data types for double-precision complex 
+
+#if FFT_PRECISION_CU == 2
+
+// defined outside the FFT_CUFFT section as in the single-precision case;
+// the function prototypes below use FFT_DATA in every build
+typedef struct {
+  double re;
+  double im;
+} FFT_DATA;
+
+#ifdef FFT_CUFFT
+#include "cuda_runtime.h"
+#include "cufft.h"
+typedef cufftDoubleComplex cufftData;
+typedef cufftDoubleReal cufftDataInit;
+#define cufft cufftExecZ2Z
+#define cufftinit cufftExecD2Z
+#define CUFFT_PLAN CUFFT_Z2Z
+#define CUFFT_PLAN_INIT CUFFT_D2Z
+#endif
+
+#endif
+
+// ------------------------------------------------------------------------- 
+
+// details of how to do a 3d FFT 
+
+struct fft_plan_3d {
+  // remap plans between the stages of the transform
+  struct remap_plan_3d *pre_plan;       // remap from input -> 1st FFTs
+  struct remap_plan_3d *mid1_plan;      // remap from 1st -> 2nd FFTs
+  struct remap_plan_3d *mid2_plan;      // remap from 2nd -> 3rd FFTs
+  struct remap_plan_3d *post_plan;      // remap from 3rd FFTs -> output
+
+  // host work buffers
+  FFT_DATA *copy;                       // memory for remap results (if needed)
+  FFT_DATA *scratch;                    // scratch space for remaps
+
+  // # of 1st,2nd,3rd FFTs (times length)
+  int total1;
+  int total2;
+  int total3;
+
+  // length of 1st,2nd,3rd FFTs
+  int length1;
+  int length2;
+  int length3;
+
+  // where to put remap results
+  int pre_target;
+  int mid1_target;
+  int mid2_target;
+
+  int scaled;                           // whether to scale FFT results
+  int normnum;                          // # of values to rescale
+  double norm;                          // normalization factor for rescaling
+
+  // bookkeeping, all reset to 0 when the plan is created;
+  // iterate is incremented by every fft_3d_cuda() call
+  double coretime;
+  double ffttime;
+  int iterate;
+
+#ifdef FFT_CUFFT
+  // device buffers, assigned through FFT3dCuda::set_cudata()
+  cufftData* cudata;
+  cufftData* cudata2;
+  unsigned int cudatasize;              // nfast*nmid*nslow*sizeof(FFT_DATA)
+
+  // cuFFT handles; only plan_3d is created and destroyed by this package
+  cufftHandle plan_fast;
+  cufftHandle plan_mid;
+  cufftHandle plan_slow;
+  cufftHandle plan_3d;
+
+  // global grid size
+  int nfast;
+  int nmid;
+  int nslow;
+
+  // output bounds passed to fft_3d_create_plan_cuda()
+  int ihi_out;
+  int ilo_out;
+  int jhi_out;
+  int jlo_out;
+  int khi_out;
+  int klo_out;
+
+  // rank and size of the communicator the plan was created on
+  int me;
+  int nprocs;
+#endif
+
+  int init;                             // copy of the ainit argument of the creator
+};
+
+// function prototypes 
+
+void fft_3d_destroy_plan_cuda(struct fft_plan_3d *);  // release a plan and everything it owns
+void factor_cuda(int, int *, int *);  // (n, num, list): store small factors of n in list, count them in num
+void bifactor_cuda(int, int *, int *);  // (n, factor1, factor2): split n into two factors near sqrt(n)
+void fft_1d_only_cuda(FFT_DATA *, int, int, struct fft_plan_3d *);  // (data, nsize, flag, plan): timing helper
+void fft_3d_cudaA(FFT_DATA *, FFT_DATA *, int, struct fft_plan_3d *);  // NOTE(review): no definition in fft3d_cuda.cpp - confirm it exists elsewhere
+void fft_3d_cuda(FFT_DATA *, FFT_DATA *, int, struct fft_plan_3d *);  // (in, out, flag, plan): run the 3d transform
+struct fft_plan_3d *fft_3d_create_plan_cuda(MPI_Comm, int, int, int,
+  int, int, int, int, int, int, int, int, int, int, int, int,
+  int, int, int *,bool init);  // returns NULL when plan creation fails
--- a/src/USER-CUDA/fft3d_wrap_cuda.cpp
+++ b/src/USER-CUDA/fft3d_wrap_cuda.cpp
@ -0,0 +1,111 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include "fft3d_wrap_cuda.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FFT3dCuda::FFT3dCuda(LAMMPS *lmp, MPI_Comm comm, int nfast, int nmid, int nslow,
+	     int in_ilo, int in_ihi, int in_jlo, int in_jhi,
+	     int in_klo, int in_khi,
+	     int out_ilo, int out_ihi, int out_jlo, int out_jhi,
+	     int out_klo, int out_khi,
+	     int scaled, int permute, int *nbuf,bool init) : Pointers(lmp)
+{
+  // with FFT_CUFFT the CUDA plan factory is used; otherwise the regular
+  // fft_3d_create_plan() is called, which does not receive the init flag
+#ifdef FFT_CUFFT
+  plan = fft_3d_create_plan_cuda(comm,nfast,nmid,nslow,
+                                 in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
+                                 out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
+                                 scaled,permute,nbuf,init);
+#else
+  plan = fft_3d_create_plan(comm,nfast,nmid,nslow,
+                            in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
+                            out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
+                            scaled,permute,nbuf);
+#endif
+
+  if (!plan) error->one("Could not create 3d FFT plan");
+}
+
+/* ---------------------------------------------------------------------- */
+
+FFT3dCuda::~FFT3dCuda()
+{
+  // release the plan with the routine matching the one that created it
+#ifdef FFT_CUFFT
+  fft_3d_destroy_plan_cuda(plan);
+#else
+  fft_3d_destroy_plan(plan);
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FFT3dCuda::compute(double *in, double *out, int flag)
+{
+  // both backends take the buffers as FFT_DATA
+  FFT_DATA *src = (FFT_DATA *) in;
+  FFT_DATA *dst = (FFT_DATA *) out;
+
+#ifdef FFT_CUFFT
+  fft_3d_cuda(src,dst,flag,plan);
+#else
+  fft_3d(src,dst,flag,plan);
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FFT3dCuda::timing1d(double *in, int nsize, int flag)
+{
+  // forward to the timing routine of the backend selected at compile time
+  FFT_DATA *buf = (FFT_DATA *) in;
+
+#ifdef FFT_CUFFT
+  fft_1d_only_cuda(buf,nsize,flag,plan);
+#else
+  fft_1d_only(buf,nsize,flag,plan);
+#endif
+}
+
+#ifdef FFT_CUFFT
+void FFT3dCuda::set_cudata(void* cudata,void* cudata2)
+{
+  // store the two caller-provided buffers in the plan as cufftData pointers
+  plan->cudata = static_cast<cufftData*> (cudata);
+  plan->cudata2 = static_cast<cufftData*> (cudata2);
+}
+#endif
--- a/src/USER-CUDA/fft3d_wrap_cuda.h
+++ b/src/USER-CUDA/fft3d_wrap_cuda.h
@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef FFT3D_WRAP_CUDA_H_
+#define FFT3D_WRAP_CUDA_H_
+
+#include "pointers.h"
+
+#ifdef FFT_CUFFT
+  #include "fft3d_cuda.h"
+#endif
+#ifndef FFT_CUFFT
+  #include "fft3d.h"
+#endif
+
+namespace LAMMPS_NS {
+
+class FFT3dCuda : protected Pointers {  // wrapper around one fft_plan_3d; uses the *_cuda FFT routines when FFT_CUFFT is defined
+ public:
+  FFT3dCuda(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,int,int,int,
+	int,int,int,int,int,int,int,int,int *,bool);  // creates the plan; calls error->one if plan creation returns NULL
+  ~FFT3dCuda();  // destroys the plan
+  void compute(double *, double *, int);  // 3d FFT from the first buffer to the second; the int flag is passed through to the backend
+  void timing1d(double *, int, int);  // forwards to fft_1d_only / fft_1d_only_cuda
+
+#ifdef FFT_CUFFT
+  void set_cudata(void* cudata,void* cudata2);  // stores both pointers in the plan as cufftData*
+#endif
+ private:
+  struct fft_plan_3d *plan;  // plan handle used by every member function
+};
+
+}
+
+#endif /*FFT3D_WRAP_CUDA_H_*/
--- a/src/USER-CUDA/fix_addforce_cuda.cpp
+++ b/src/USER-CUDA/fix_addforce_cuda.cpp
@ -0,0 +1,190 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+
+#include <cstring>
+#include <cstdlib>
+#include "fix_addforce_cuda.h"
+#include "fix_addforce_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "domain.h"
+#include "cuda.h"
+#include "memory.h"
+#include "cuda_modify_flags.h"
+
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixAddForceCuda::FixAddForceCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  // a /cuda style needs the Cuda object held by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  if (narg < 6) error->all("Illegal fix addforce/cuda command");
+
+  // this fix exposes a scalar and a 3-element vector
+  scalar_flag = vector_flag = 1;
+  size_vector = 3;
+  global_freq = 1;
+  extscalar = extvector = 1;
+
+  // arg[3..5]: the three force components
+  xvalue = atof(arg[3]);
+  yvalue = atof(arg[4]);
+  zvalue = atof(arg[5]);
+
+  // optional args: only "region <ID>" is recognized
+  iregion = -1;
+  for (int iarg = 6; iarg < narg; ) {
+    if (strcmp(arg[iarg],"region") != 0)
+      error->all("Illegal fix addforce/cuda command");
+    else {
+      if (iarg+2 > narg) error->all("Illegal fix addforce/cuda command");
+      iregion = domain->find_region(arg[iarg+1]);
+      if (iregion == -1) error->all("Fix addforce/cuda region ID does not exist");
+      iarg += 2;
+    }
+  }
+
+  // a region that parsed successfully is still rejected here
+  if (iregion != -1) error->all("Error: fix addforce/cuda does not currently support 'region' option");
+
+  force_flag = 0;
+  for (int k = 0; k < 4; k++) foriginal[k] = 0.0;
+  cu_foriginal = NULL;   // allocated later, in init()
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixAddForceCuda::setmask()
+{
+  // OR together the four flag constants this fix reports to its caller
+  const int flags = POST_FORCE_CUDA | THERMO_ENERGY_CUDA |
+                    POST_FORCE_RESPA | MIN_POST_FORCE_CUDA;
+  return flags;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixAddForceCuda::init()
+{
+  // create the 4-element cCudaData wrapper around foriginal on the first call only
+  if (cu_foriginal == NULL) {
+    cu_foriginal = new cCudaData<double, F_FLOAT, x> (foriginal,4);
+  }
+
+  // nlevels_respa is assigned only when the integrator style is "respa"
+  const bool is_respa = (strcmp(update->integrate_style,"respa") == 0);
+  if (is_respa) {
+    nlevels_respa = ((Respa *) update->integrate)->nlevels;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixAddForceCuda::setup(int vflag)
+{
+  MYDBG( printf("# CUDA: FixAddForceCuda::setup\n"); )
+
+  // In both branches cuda->cu_f is uploaded before the device pass and
+  // downloaded afterwards, so the host copy ends up holding the result.
+  if (strcmp(update->integrate_style,"verlet") == 0) {
+    Cuda_FixAddForceCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();
+    post_force(vflag);
+    cuda->cu_f->download();
+  } else {
+    Respa *respa = (Respa *) update->integrate;
+    respa->copy_flevel_f(nlevels_respa-1);
+    cuda->cu_f->upload();     // was download(): overwrote the level forces just copied on the host
+    post_force_respa(vflag,nlevels_respa-1,0);
+    cuda->cu_f->download();   // was upload(): replaced the device result with the stale host copy
+    respa->copy_f_flevel(nlevels_respa-1);
+  }
+  MYDBG( printf("# CUDA: FixAddForceCuda::setup done\n"); )
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixAddForceCuda::min_setup(int vflag)
+{
+  // plain delegation: same work as post_force() with the same vflag
+  this->post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixAddForceCuda::post_force(int vflag)
+{
+  MYDBG( printf("# CUDA: FixAddForceCuda::postforce start\n"); )
+
+  // foriginal_all must be reduced again after this call
+  force_flag = 0;
+
+  // zero the device buffer, let the device routine fill it, then copy
+  // the values back into the host array foriginal
+  cu_foriginal->memset_device(0);
+  F_FLOAT *dev_sums = (F_FLOAT*) cu_foriginal->dev_data();
+  Cuda_FixAddForceCuda_PostForce(&cuda->shared_data, groupbit,
+                                 xvalue, yvalue, zvalue, dev_sums);
+  cu_foriginal->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixAddForceCuda::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  // act only on level nlevels_respa-1; every other level is a no-op
+  if (ilevel != nlevels_respa-1) return;
+  post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixAddForceCuda::min_post_force(int vflag)
+{
+  // plain delegation: same work as post_force() with the same vflag
+  this->post_force(vflag);
+}
+
+/* ----------------------------------------------------------------------
+   potential energy of added force
+------------------------------------------------------------------------- */
+
+double FixAddForceCuda::compute_scalar()
+{
+  // Reduce the per-process values at most once after each post_force();
+  // force_flag records that foriginal_all is already up to date.
+  if (!force_flag) {
+    force_flag = 1;
+    MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
+  }
+
+  // slot 0 is the scalar; slots 1-3 are served by compute_vector()
+  const double energy = foriginal_all[0];
+  return energy;
+}
+
+/* ----------------------------------------------------------------------
+   return components of total force on fix group before force was changed
+------------------------------------------------------------------------- */
+
+double FixAddForceCuda::compute_vector(int n)
+{
+  // Reduce the per-process values at most once after each post_force();
+  // force_flag records that foriginal_all is already up to date.
+  if (!force_flag) {
+    force_flag = 1;
+    MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
+  }
+
+  // slot 0 belongs to compute_scalar(), so component n lives at n+1
+  const int slot = n + 1;
+  return foriginal_all[slot];
+}
--- a/src/USER-CUDA/fix_addforce_cuda.h
+++ b/src/USER-CUDA/fix_addforce_cuda.h
@ -0,0 +1,64 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(addforce/cuda,FixAddForceCuda)
+
+#else
+
+#ifndef LMP_FIX_ADD_FORCE_CUDA_H
+#define LMP_FIX_ADD_FORCE_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class FixAddForceCuda : public Fix {  // implementation of fix style addforce/cuda
+ public:
+  FixAddForceCuda(class LAMMPS *, int, char **);  // reads the three force components from arg[3..5]
+  int setmask();
+  void init();  // allocates cu_foriginal on the first call
+  void setup(int);
+  void min_setup(int);
+  void post_force(int);  // runs the device routine and downloads its sums into foriginal
+  void post_force_respa(int, int, int);  // acts only on level nlevels_respa-1
+  void min_post_force(int);
+  double compute_scalar();  // returns foriginal_all[0]
+  double compute_vector(int);  // returns foriginal_all[n+1]
+
+ private:
+  class Cuda *cuda;  // taken from lmp->cuda in the constructor
+  int iregion;  // region index; any value other than -1 is rejected in the constructor
+  double xvalue,yvalue,zvalue;  // force components passed to the device routine
+  double foriginal[4],foriginal_all[4];  // per-process values and their MPI_SUM reduction
+  cCudaData<double     , F_FLOAT   		, x>* cu_foriginal;	// device-side buffer paired with foriginal. NOTE(review): allocated with new in init(), but this class declares no destructor - confirm who frees it
+  int force_flag;  // 1 once foriginal_all has been reduced since the last post_force()
+  int nlevels_respa;  // assigned in init() only when the integrator style is "respa"
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_aveforce_cuda.cpp
+++ b/src/USER-CUDA/fix_aveforce_cuda.cpp
@ -0,0 +1,229 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+
+#include "mpi.h"
+#include <cstring>
+#include <cstdlib>
+#include "fix_aveforce_cuda.h"
+#include "fix_aveforce_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "domain.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixAveForceCuda::FixAveForceCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // This used to be "narg != 6", which made the optional-argument loop below
+  // unreachable and hid its more specific diagnostics. Commands with more
+  // than 6 arguments still end in error->all(), now with the message of the
+  // check they actually fail.
+  if (narg < 6) error->all("Illegal fix aveforce command");
+
+  vector_flag = 1;
+  size_vector = 3;
+  global_freq = 1;
+  extvector = 1;
+
+  // arg[3..5]: value per component, or "NULL" to clear that component's flag.
+  // The values start at 0.0 because post_force() adds them unconditionally,
+  // including for components whose flag is 0; they were uninitialized before.
+  xflag = yflag = zflag = 1;
+  xvalue = yvalue = zvalue = 0.0;
+  if (strcmp(arg[3],"NULL") == 0) xflag = 0;
+  else xvalue = atof(arg[3]);
+  if (strcmp(arg[4],"NULL") == 0) yflag = 0;
+  else yvalue = atof(arg[4]);
+  if (strcmp(arg[5],"NULL") == 0) zflag = 0;
+  else zvalue = atof(arg[5]);
+
+  // optional args
+
+  iregion = -1;
+
+  int iarg = 6;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"region") == 0) {
+      if (iarg+2 > narg) error->all("Illegal fix aveforce command");
+      iregion = domain->find_region(arg[iarg+1]);
+      if (iregion == -1) error->all("Fix aveforce region ID does not exist");
+      iarg += 2;
+    } else error->all("Illegal fix aveforce command");
+  }
+
+  // a region that parsed successfully is still rejected here
+  if(iregion!=-1) error->all("Error: fix aveforce/cuda does not currently support 'region' option");
+
+  foriginal_all[0] = foriginal_all[1] = foriginal_all[2] = foriginal_all[3] = 0.0;
+  foriginal[0] = foriginal[1] = foriginal[2] = foriginal[3] = 0.0;
+  cu_foriginal = NULL;   // allocated later, in init()
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixAveForceCuda::setmask()
+{
+  // OR together the three flag constants this fix reports to its caller
+  const int flags = POST_FORCE_CUDA | POST_FORCE_RESPA | MIN_POST_FORCE_CUDA;
+  return flags;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixAveForceCuda::init()
+{
+  // create the 4-element cCudaData wrapper around foriginal on the first call only
+  if (cu_foriginal == NULL)
+    cu_foriginal = new cCudaData<double, F_FLOAT, x> (foriginal,4);
+
+  // nlevels_respa is assigned only when the integrator style is "respa"
+  if (strcmp(update->integrate_style,"respa") == 0)
+    nlevels_respa = ((Respa *) update->integrate)->nlevels;
+
+  // No atom counting happens here: the unused locals 'mask' and 'nlocal' were
+  // removed; post_force() takes the count from foriginal_all[3].
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixAveForceCuda::setup(int vflag)  // applies the fix once, branching on the integrator style
+{
+  if (strcmp(update->integrate_style,"verlet") == 0)
+  {
+    Cuda_FixAveForceCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();  // upload before and download after the device pass in post_force()
+    post_force(vflag);
+    cuda->cu_f->download();
+    
+  }
+  else
+  {
+    cuda->cu_f->download();  // every style other than "verlet" takes this branch and is treated as rRESPA
+    for (int ilevel = 0; ilevel < nlevels_respa; ilevel++) {
+      ((Respa *) update->integrate)->copy_flevel_f(ilevel);
+      post_force_respa(vflag,ilevel,0);  // NOTE(review): for inner levels this calls cu_f->download() again, which presumably overwrites the level forces copied on the previous line - confirm
+      ((Respa *) update->integrate)->copy_f_flevel(ilevel);
+    }
+    cuda->cu_f->upload();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixAveForceCuda::min_setup(int vflag)
+{
+  // plain delegation: same work as post_force() with the same vflag
+  this->post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixAveForceCuda::post_force(int vflag)
+{
+  // Step 1: sum forces on participating atoms. The device routine fills a
+  // zeroed buffer, which is then downloaded into foriginal.
+  cu_foriginal->memset_device(0);
+  F_FLOAT *dev_sums = (F_FLOAT*) cu_foriginal->dev_data();
+  Cuda_FixAveForceCuda_PostForce_FOrg(&cuda->shared_data, groupbit, dev_sums);
+  cu_foriginal->download();
+
+  // Step 2: combine the per-process sums; slot 3 carries the atom count.
+  MPI_Allreduce(foriginal,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
+  const int ncount = static_cast<int> (foriginal_all[3]);
+  if (ncount == 0) return;
+
+  // Step 3: average force per atom plus the requested amount.
+  const double extra[3] = {xvalue, yvalue, zvalue};
+  double fave[3];
+  for (int d = 0; d < 3; d++) fave[d] = foriginal_all[d]/ncount + extra[d];
+
+  // Step 4: set force of all participating atoms to the same value,
+  // only for active dimensions (xflag/yflag/zflag).
+  Cuda_FixAveForceCuda_PostForce_Set(&cuda->shared_data, groupbit,
+                                     xflag, yflag, zflag,
+                                     fave[0], fave[1], fave[2]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixAveForceCuda::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  // ave + extra force on outermost level
+  if (ilevel == nlevels_respa-1) {
+    post_force(vflag);
+    return;
+  }
+
+  // just ave on inner levels, computed on the host:
+  // bring cu_f and cu_mask back from the device before reading atom->f / atom->mask
+  cuda->cu_f->download();
+  cuda->cu_mask->download();
+  double **f = atom->f;
+  int *mask = atom->mask;
+  const int nlocal = atom->nlocal;
+
+  // local accumulator (fx, fy, fz, atom count); the member foriginal is not touched
+  double fsum[4] = {0.0, 0.0, 0.0, 0.0};
+  for (int i = 0; i < nlocal; i++) {
+    if (!(mask[i] & groupbit)) continue;
+    for (int d = 0; d < 3; d++) fsum[d] += f[i][d];
+    fsum[3] += 1;
+  }
+
+  MPI_Allreduce(fsum,foriginal_all,4,MPI_DOUBLE,MPI_SUM,world);
+  const int ncount = static_cast<int> (foriginal_all[3]);
+  if (ncount == 0) return;
+
+  double fave[3];
+  for (int d = 0; d < 3; d++) fave[d] = foriginal_all[d]/ncount;
+
+  // overwrite the flagged components of every group atom with the average
+  for (int i = 0; i < nlocal; i++) {
+    if (!(mask[i] & groupbit)) continue;
+    if (xflag) f[i][0] = fave[0];
+    if (yflag) f[i][1] = fave[1];
+    if (zflag) f[i][2] = fave[2];
+  }
+
+  // send the modified host forces back to the device
+  cuda->cu_f->upload();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixAveForceCuda::min_post_force(int vflag)
+{
+  // plain delegation: same work as post_force() with the same vflag
+  this->post_force(vflag);
+}
+
+/* ----------------------------------------------------------------------
+   return components of total force on fix group before force was changed
+------------------------------------------------------------------------- */
+
+double FixAveForceCuda::compute_vector(int n)
+{
+  // foriginal_all is already reduced inside post_force()/post_force_respa(),
+  // so no MPI call is needed here
+  const double component = foriginal_all[n];
+  return component;
+}
--- a/src/USER-CUDA/fix_aveforce_cuda.h
+++ b/src/USER-CUDA/fix_aveforce_cuda.h
@ -0,0 +1,64 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(aveforce/cuda,FixAveForceCuda)
+
+#else
+
+
+#ifndef LMP_FIX_AVE_FORCE_CUDA_H
+#define LMP_FIX_AVE_FORCE_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class FixAveForceCuda : public Fix {  // implementation of fix style aveforce/cuda
+ public:
+  FixAveForceCuda(class LAMMPS *, int, char **);  // reads arg[3..5]; "NULL" clears the flag of that component
+  int setmask();
+  void init();  // allocates cu_foriginal on the first call
+  void setup(int);
+  void min_setup(int);
+  void post_force(int);  // device sum, MPI reduction, then device pass that sets the averaged force
+  void post_force_respa(int, int, int);  // level nlevels_respa-1 uses post_force(); other levels average on the host
+  void min_post_force(int);
+  double compute_vector(int);  // returns foriginal_all[n]
+
+ private:
+  class Cuda *cuda;  // taken from lmp->cuda in the constructor
+  int xflag,yflag,zflag,iregion;  // per-component enable flags; iregion other than -1 is rejected in the constructor
+  double xvalue,yvalue,zvalue;  // amounts added to the averaged force in post_force()
+  double foriginal_all[4];  // MPI_SUM reduction: three force sums followed by the atom count
+  double foriginal[4];  // per-process values downloaded from the device
+  cCudaData<double     , F_FLOAT   		, x>* cu_foriginal;	// device-side buffer paired with foriginal. NOTE(review): allocated with new in init(), but this class declares no destructor - confirm who frees it
+  int nlevels_respa;  // assigned in init() only when the integrator style is "respa"
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_enforce2d_cuda.cpp
+++ b/src/USER-CUDA/fix_enforce2d_cuda.cpp
@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include "fix_enforce2d_cuda.h"
+#include "fix_enforce2d_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "domain.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixEnforce2DCuda::FixEnforce2DCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  // a /cuda style needs the Cuda object held by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (!cuda) {
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  }
+
+  // exactly three arguments are accepted
+  if (narg != 3) {
+    error->all("Illegal fix enforce2d command");
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixEnforce2DCuda::setmask()
+{
+  // OR together the three flag constants this fix reports to its caller
+  const int flags = POST_FORCE_CUDA | POST_FORCE_RESPA | MIN_POST_FORCE_CUDA;
+  return flags;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixEnforce2DCuda::init()
+{
+  if (domain->dimension == 3) {
+    error->all("Cannot use fix enforce2d/cuda with 3d simulation");
+  }
+
+  // One warning for each per-atom property that post_force() zeroes in a
+  // host loop rather than through the device routine.
+  if (atom->omega_flag != 0) {
+    error->warning("Enforce2d/cuda does not support omega_flag on gpu yet. Will be handled on cpu.");
+  }
+  if (atom->angmom_flag != 0) {
+    error->warning("Enforce2d/cuda does not support angmom_flag (angular momentum) on gpu yet. Will be handled on cpu.");
+  }
+  if (atom->torque_flag != 0) {
+    error->warning("Enforce2d/cuda does not support torque_flag on gpu yet. Will be handled on cpu.");
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixEnforce2DCuda::setup(int vflag)  // applies the fix once, branching on the integrator style
+{
+  if (strcmp(update->integrate_style,"verlet") == 0)
+  {
+    Cuda_FixEnforce2dCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();  // cu_f and cu_v are uploaded before the device pass ...
+    cuda->cu_v->upload();
+    post_force(vflag);
+    cuda->cu_f->download();  // ... and downloaded again afterwards
+    cuda->cu_v->download();
+  }
+  else {
+    int nlevels_respa = ((Respa *) update->integrate)->nlevels;  // every style other than "verlet" is treated as rRESPA
+    for (int ilevel = 0; ilevel < nlevels_respa; ilevel++) {
+      ((Respa *) update->integrate)->copy_flevel_f(ilevel);
+      post_force_respa(vflag,ilevel,0);  // NOTE(review): unlike the verlet branch there is no upload()/download() around this device call - confirm the device sees the level forces copied above
+      ((Respa *) update->integrate)->copy_f_flevel(ilevel);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixEnforce2DCuda::min_setup(int vflag)
+{
+  // plain delegation: same work as post_force() with the same vflag
+  this->post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixEnforce2DCuda::post_force(int vflag)
+{
+  // device pass first
+  Cuda_FixEnforce2dCuda_PostForce(&cuda->shared_data, groupbit);
+
+  // The remaining properties are handled on the host. The loop covers
+  // atom->nfirst atoms when this fix's group is atom->firstgroup and
+  // atom->nlocal atoms otherwise.
+  int *mask = atom->mask;
+  const int natoms = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal;
+
+  const int do_omega = atom->omega_flag;
+  const int do_angmom = atom->angmom_flag;
+  const int do_torque = atom->torque_flag;
+  double **omega = atom->omega;
+  double **angmom = atom->angmom;
+  double **torque = atom->torque;
+
+  // for every atom of the group, zero components 0 and 1 of each enabled property
+  for (int i = 0; i < natoms; i++) {
+    if (!(mask[i] & groupbit)) continue;
+    if (do_omega) {
+      omega[i][0] = 0.0;
+      omega[i][1] = 0.0;
+    }
+    if (do_angmom) {
+      angmom[i][0] = 0.0;
+      angmom[i][1] = 0.0;
+    }
+    if (do_torque) {
+      torque[i][0] = 0.0;
+      torque[i][1] = 0.0;
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixEnforce2DCuda::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  // every level is treated alike: ilevel and iloop are ignored
+  this->post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixEnforce2DCuda::min_post_force(int vflag)
+{
+  // plain delegation: same work as post_force() with the same vflag
+  this->post_force(vflag);
+}
--- a/src/USER-CUDA/fix_enforce2d_cuda.h
+++ b/src/USER-CUDA/fix_enforce2d_cuda.h
@ -0,0 +1,55 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(enforce2d/cuda,FixEnforce2DCuda)
+
+#else
+
+#ifndef LMP_FIX_ENFORCE2D_CUDA_H
+#define LMP_FIX_ENFORCE2D_CUDA_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+
+class FixEnforce2DCuda : public Fix {  // implementation of fix style enforce2d/cuda
+ public:
+  FixEnforce2DCuda(class LAMMPS *, int, char **);  // accepts exactly three arguments
+  int setmask();
+  void init();  // rejects 3d simulations; warns about properties handled on the cpu
+  void setup(int);
+  void min_setup(int);
+  void post_force(int);  // device routine plus host loops for omega / angmom / torque
+  void post_force_respa(int, int, int);  // calls post_force() on every level
+  void min_post_force(int);
+
+  private:
+  class Cuda *cuda;  // taken from lmp->cuda in the constructor
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_freeze_cuda.cpp
+++ b/src/USER-CUDA/fix_freeze_cuda.cpp
@ -0,0 +1,135 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#include <cstring>
+#include <cstdlib>
+#include "fix_freeze_cuda.h"
+#include "fix_freeze_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "memory.h"
+#include "modify.h"
+#include "cuda_modify_flags.h"
+
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixFreezeCuda::FixFreezeCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  // a /cuda style needs the Cuda object held by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // exactly three arguments, and the atom style must provide torque
+  if (narg != 3) error->all("Illegal fix freeze command");
+  if (atom->torque_flag == 0)
+    error->all("Fix freeze requires atom attribute torque");
+
+  // this fix exposes a 3-element vector
+  vector_flag = 1;
+  size_vector = 3;
+  global_freq = 1;
+  extvector = 1;
+
+  force_flag = 0;
+  for (int k = 0; k < 3; k++) foriginal[k] = 0.0;
+  cu_foriginal = NULL;   // allocated later, in init()
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixFreezeCuda::setmask()
+{
+  // OR together the two flag constants this fix reports to its caller
+  const int flags = POST_FORCE_CUDA | THERMO_ENERGY_CUDA;
+  return flags;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixFreezeCuda::init()
+{
+  // create the 3-element cCudaData wrapper around foriginal on the first call only
+  if (cu_foriginal == NULL)
+    cu_foriginal = new cCudaData<double, F_FLOAT, x> (foriginal,3);
+
+  // At most one freeze fix may be defined. This class is registered under the
+  // style name "freeze/cuda", so matching "freeze" alone (as before) never
+  // counted instances of this fix; both spellings are counted now.
+  int count = 0;
+  for (int i = 0; i < modify->nfix; i++) {
+    const char *fstyle = modify->fix[i]->style;
+    if (strcmp(fstyle,"freeze") == 0 || strcmp(fstyle,"freeze/cuda") == 0) count++;
+  }
+  if (count > 1) error->all("More than one fix freeze");
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixFreezeCuda::setup(int vflag)
+{
+  MYDBG( printf("# CUDA: FixFreezeCuda::setup\n"); )
+
+  // Only the "verlet" integrator style is handled; for any other style this
+  // function does nothing apart from the debug output.
+  const bool is_verlet = (strcmp(update->integrate_style,"verlet") == 0);
+  if (is_verlet) {
+    Cuda_FixFreezeCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();
+    post_force(vflag);
+    cuda->cu_f->download();
+  }
+
+  MYDBG( printf("# CUDA: FixFreezeCuda::setup done\n"); )
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* ---------------------------------------------------------------------- */
+
+void FixFreezeCuda::post_force(int vflag)
+{
+  MYDBG( printf("# CUDA: FixFreezeCuda::postforce start\n"); )
+
+  // foriginal_all must be reduced again after this call
+  force_flag = 0;
+
+  // zero the device buffer, let the device routine fill it, then copy
+  // the values back into the host array foriginal
+  cu_foriginal->memset_device(0);
+  F_FLOAT *dev_sums = (F_FLOAT*) cu_foriginal->dev_data();
+  Cuda_FixFreezeCuda_PostForce(&cuda->shared_data, groupbit, dev_sums);
+  cu_foriginal->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+
+
+/* ----------------------------------------------------------------------
+   return components of total force on fix group before force was changed
+------------------------------------------------------------------------- */
+
+double FixFreezeCuda::compute_vector(int n)
+{
+  // only sum across procs one time
+
+  if (force_flag == 0) {
+    MPI_Allreduce(foriginal,foriginal_all,3,MPI_DOUBLE,MPI_SUM,world);
+    force_flag = 1;
+  }
+
+  // foriginal_all has exactly 3 entries (indices 0..2) and no leading scalar
+  // slot. The former index n+1 skipped entry 0 and read past the end of the
+  // array for n == 2.
+  return foriginal_all[n];
+}
--- a/src/USER-CUDA/fix_freeze_cuda.h
+++ b/src/USER-CUDA/fix_freeze_cuda.h
@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(freeze/cuda,FixFreezeCuda)
+
+#else
+
+#ifndef LMP_FIX_FREEZE_CUDA_H
+#define LMP_FIX_FREEZE_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class FixFreezeCuda : public Fix {  // implementation of fix style freeze/cuda
+ public:
+  FixFreezeCuda(class LAMMPS *, int, char **);  // accepts exactly three arguments; requires atom->torque_flag
+  int setmask();
+  void init();  // allocates cu_foriginal on the first call; checks for more than one freeze fix
+  void setup(int);  // acts only for the "verlet" integrator style
+  void post_force(int);  // runs the device routine and downloads its sums into foriginal
+  double compute_vector(int);  // returns one entry of foriginal_all, reducing across processes first if needed
+
+ private:
+  class Cuda *cuda;  // taken from lmp->cuda in the constructor
+  double foriginal[3],foriginal_all[3];  // per-process values and their MPI_SUM reduction (3 entries each)
+  cCudaData<double     , F_FLOAT   		, x>* cu_foriginal;	// device-side buffer paired with foriginal. NOTE(review): allocated with new in init(), but this class declares no destructor - confirm who frees it
+  int force_flag;  // 1 once foriginal_all has been reduced since the last post_force()
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_gravity_cuda.cpp
+++ b/src/USER-CUDA/fix_gravity_cuda.cpp
@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+#include "fix_gravity_cuda.h"
+#include "fix_gravity_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "domain.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+
+using namespace LAMMPS_NS;
+
+// gravity direction styles, selected by the keyword in arg[4] of the fix command
+enum{CHUTE,SPHERICAL,GRADIENT,VECTOR};
+
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   parse the fix arguments and set up the initial gravity direction
+     arg[3] = magnitude, arg[4] = style keyword, then per style:
+       chute     angle                     (phi = 0, theta = 180 - angle)
+       spherical phi theta
+       gradient  phi theta phigrad thetagrad
+       vector    x y z                     (normalized to unit length)
+   angles are in degrees; any malformed command ends in error->all()
+------------------------------------------------------------------------- */
+
+FixGravityCuda::FixGravityCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  if (narg < 5) error->all("Illegal fix gravity command");
+
+  time_depend = 1;
+
+  magnitude = atof(arg[3]);
+
+  if (strcmp(arg[4],"chute") == 0) {
+    if (narg != 6) error->all("Illegal fix gravity command");
+    style = CHUTE;
+    phi = 0.0;
+    theta = 180.0 - atof(arg[5]);
+  } else if (strcmp(arg[4],"spherical") == 0) {
+    if (narg != 7) error->all("Illegal fix gravity command");
+    style = SPHERICAL;
+    phi = atof(arg[5]);
+    theta = atof(arg[6]);
+  } else if (strcmp(arg[4],"gradient") == 0) {
+    if (narg != 9) error->all("Illegal fix gravity command");
+    style = GRADIENT;
+    phi = atof(arg[5]);
+    theta = atof(arg[6]);
+    phigrad = atof(arg[7]);
+    thetagrad = atof(arg[8]);
+  } else if (strcmp(arg[4],"vector") == 0) {
+    if (narg != 8) error->all("Illegal fix gravity command");
+    style = VECTOR;
+    xdir = atof(arg[5]);
+    ydir = atof(arg[6]);
+    zdir = atof(arg[7]);
+  } else error->all("Illegal fix gravity command");
+
+  double PI = 4.0*atan(1.0);
+  degree2rad = PI/180.0;
+
+  if (style == CHUTE || style == SPHERICAL || style == GRADIENT) {
+    // angular styles: convert (theta,phi) into a direction vector
+    if (domain->dimension == 3) {
+      xgrav = sin(degree2rad * theta) * cos(degree2rad * phi);
+      ygrav = sin(degree2rad * theta) * sin(degree2rad * phi);
+      zgrav = cos(degree2rad * theta);
+    } else {
+      xgrav = sin(degree2rad * theta);
+      ygrav = cos(degree2rad * theta);
+      zgrav = 0.0;
+    }
+  } else if (style == VECTOR) {
+    // in 2d only the x,y components define the direction
+    const bool three_d = (domain->dimension == 3);
+    double length;
+    if (three_d) length = sqrt(xdir*xdir + ydir*ydir + zdir*zdir);
+    else length = sqrt(xdir*xdir + ydir*ydir);
+
+    // a zero-length direction cannot be normalized; without this check
+    // the divisions below silently produce NaN/inf accelerations
+    if (length == 0.0) error->all("Illegal fix gravity command");
+
+    xgrav = xdir/length;
+    ygrav = ydir/length;
+    if (three_d) zgrav = zdir/length;
+    else zgrav = 0.0;
+  }
+
+  // reference timestep for the time-dependent gradient style
+  time_origin = update->ntimestep;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixGravityCuda::setmask()
+{
+  // POST_FORCE_CUDA is the only flag this fix sets
+  int flags = 0;
+  flags |= POST_FORCE_CUDA;
+  return flags;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixGravityCuda::init()
+{
+  // acceleration vector = magnitude * current direction
+  xacc = magnitude*xgrav;
+  yacc = magnitude*ygrav;
+  zacc = magnitude*zgrav;
+
+  // cache the timestep size, post_force() needs it for the gradient style
+  dt = update->dt;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixGravityCuda::setup(int vflag)
+{
+  MYDBG( printf("# CUDA: FixGravityCuda::setup\n"); )
+
+  // only the verlet integrate style is handled;
+  // for any other style nothing is applied here
+
+  const bool verlet = (strcmp(update->integrate_style,"verlet") == 0);
+  if (verlet) {
+    Cuda_FixGravityCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();
+    post_force(vflag);
+    cuda->cu_f->download();
+  }
+
+  MYDBG( printf("# CUDA: FixGravityCuda::setup done\n"); )
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixGravityCuda::post_force(int vflag)
+{
+  // gradient style: the direction changes with the number of elapsed
+  // timesteps, so rebuild direction and acceleration before applying them
+
+  if (style == GRADIENT) {
+    const double theta_current = degree2rad *
+      (theta + (update->ntimestep - time_origin)*dt*thetagrad*360.0);
+
+    if (domain->dimension != 3) {
+      // 2d: zgrav keeps the value set in the constructor
+      xgrav = sin(theta_current);
+      ygrav = cos(theta_current);
+    } else {
+      const double phi_current = degree2rad *
+        (phi + (update->ntimestep - time_origin)*dt*phigrad*360.0);
+      xgrav = sin(theta_current) * cos(phi_current);
+      ygrav = sin(theta_current) * sin(phi_current);
+      zgrav = cos(theta_current);
+    }
+
+    xacc = magnitude*xgrav;
+    yacc = magnitude*ygrav;
+    zacc = magnitude*zgrav;
+  }
+
+  MYDBG( printf("# CUDA: FixGravityCuda::postforce start\n"); )
+
+  // hand the acceleration for this fix group to the CUDA library call
+  Cuda_FixGravityCuda_PostForce(&cuda->shared_data, groupbit, xacc,yacc,zacc);
+}
+
+
--- a/src/USER-CUDA/fix_gravity_cuda.h
+++ b/src/USER-CUDA/fix_gravity_cuda.h
@ -0,0 +1,60 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(gravity/cuda,FixGravityCuda)
+
+#else
+
+#ifndef LMP_FIX_GRAVITY_CUDA_H
+#define LMP_FIX_GRAVITY_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// Fix style "gravity/cuda": stores the parsed gravity settings and the
+// acceleration vector that post_force() passes to the CUDA library call.
+class FixGravityCuda : public Fix {
+ public:
+  FixGravityCuda(class LAMMPS *, int, char **);
+  int setmask();
+  void init();
+  void setup(int);
+  void post_force(int);
+
+ private:
+  class Cuda *cuda;
+  // one of CHUTE, SPHERICAL, GRADIENT, VECTOR (enum in the .cpp)
+  int style;
+  // magnitude = arg[3] of the fix command, dt = update->dt cached in init()
+  double magnitude,dt;
+  // angles in degrees; phigrad,thetagrad are only set for the gradient style
+  double phi,theta,phigrad,thetagrad;
+  // user-supplied direction, only set for the vector style
+  double xdir,ydir,zdir;
+  // gravity direction and the magnitude-scaled acceleration
+  double xgrav,ygrav,zgrav,xacc,yacc,zacc;
+  // PI/180
+  double degree2rad;
+  // update->ntimestep at construction, reference point for the gradient style
+  int time_origin;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_nh_cuda.cpp
+++ b/src/USER-CUDA/fix_nh_cuda.cpp
--- a/src/USER-CUDA/fix_nh_cuda.h
+++ b/src/USER-CUDA/fix_nh_cuda.h
@ -0,0 +1,126 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_FIX_NH_CUDA_H
+#define LMP_FIX_NH_CUDA_H
+
+#include "fix.h"
+#include "cuda_precision.h"
+
+namespace LAMMPS_NS {
+
+// Common base class of the CUDA Nose-Hoover style fixes; FixNVTCuda and
+// FixNPTCuda derive from it and fill in id_temp/id_press and tflag/pflag.
+// NOTE(review): the implementation (fix_nh_cuda.cpp) is not visible here,
+// member comments below are the original ones unless marked otherwise.
+class FixNHCuda : public Fix {
+ public:
+  FixNHCuda(class LAMMPS *, int, char **);
+  virtual ~FixNHCuda();
+  int setmask();
+  virtual void init();
+  void setup(int);
+  virtual void initial_integrate(int);
+  virtual void final_integrate();
+  void initial_integrate_respa(int, int, int);
+  void final_integrate_respa(int, int);
+  double compute_scalar();
+  double compute_vector(int);
+  void write_restart(FILE *);
+  void restart(char *);
+  int modify_param(int, char **);
+  void reset_dt();
+
+ protected:
+  class Cuda *cuda;
+  int dimension,which;
+  double dtv,dtf,dthalf,dt4,dt8,dto;
+  double boltz,nktv2p,tdof;
+  double vol0,t0;
+
+  double t_start,t_stop;
+  double t_current,t_target;
+  double t_freq;
+
+  int tstat_flag;                   // 1 if control T
+  int pstat_flag;                   // 1 if control P
+
+  int pstyle,pcouple,allremap;
+  int p_flag[6];                   // 1 if control P on this dim, 0 if not
+  double p_start[6],p_stop[6];
+  double p_freq[6],p_target[6];
+  double omega[6],omega_dot[6];
+  double omega_mass[6];
+  double p_current[6],dilation[6];
+  double drag,tdrag_factor;        // drag factor on particle thermostat
+  double pdrag_factor;             // drag factor on barostat
+  double factor[6];                // velocity scaling due to barostat
+  int kspace_flag;                 // 1 if KSpace invoked, 0 if not
+  int nrigid;                      // number of rigid fixes
+  int *rfix;                       // indices of rigid fixes
+
+  int nlevels_respa;
+  double *step_respa;
+
+  // ids of the temperature/pressure computes; the derived classes allocate
+  // them and set tflag/pflag to 1 after creating the computes
+  char *id_temp,*id_press;
+  class Compute *temperature,*pressure;
+  int tflag,pflag;
+
+  double *eta,*eta_dot;            // chain thermostat for particles
+  double *eta_dotdot;
+  double *eta_mass;
+  int mtchain;                     // length of chain
+                                   
+  double *etap;                    // chain thermostat for barostat
+  double *etap_dot;
+  double *etap_dotdot;
+  double *etap_mass;
+  int mpchain;                     // length of chain
+                                   
+  int mtk_flag;                    // 0 if using Hoover barostat
+  double mtk_term1,mtk_term2;
+  int mtchain_default_flag;
+  int pdim;                        // number of barostatted dims
+  double mvv_current[3];           // diagonal of KE tensor
+  double mtk_factor;               // MTK factor
+  double p_freq_max;               // maximum barostat frequency
+
+  double p_hydro;                  // hydrostatic target pressure
+
+  int nc_tchain,nc_pchain;
+  double factor_eta;
+  double sigma[6];                 // scaled target stress
+  double fdev[6];                  // deviatoric force on barostat
+  int deviatoric_flag;             // 0 if target stress tensor is hydrostatic
+  double h0_inv[6];                // h_inv of reference (zero strain) box
+  int nreset_h0;                   // interval for resetting h0
+
+  void couple();
+  void couple_ke();
+  void remap();
+  void nhc_temp_integrate();
+  void nhc_press_integrate();
+
+  virtual void nve_x();            // may be overwritten by child classes
+  virtual void nve_v();
+  virtual void nh_v_press();
+  virtual void nh_v_temp();
+
+  void compute_sigma();
+  void compute_deviatoric();
+  double compute_strain_energy();
+  void compute_press_target();
+  void nh_omega_dot();
+  
+  // NOTE(review): presumably a cached copy of the CUDA-side neighbor
+  // trigger distance, as in FixNVECuda -- confirm in fix_nh_cuda.cpp
+  X_FLOAT triggerneighsq;
+};
+
+}
+
+#endif
--- a/src/USER-CUDA/fix_npt_cuda.cpp
+++ b/src/USER-CUDA/fix_npt_cuda.cpp
@ -0,0 +1,71 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include "fix_npt_cuda.h"
+#include "modify.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixNPTCuda::FixNPTCuda(LAMMPS *lmp, int narg, char **arg) :
+  FixNHCuda(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // npt needs both a thermostat and a barostat
+
+  if (!tstat_flag)
+    error->all("Temperature control must be used with fix npt");
+  if (!pstat_flag)
+    error->all("Pressure control must be used with fix npt");
+
+  // temperature compute: id = fix-ID + "_temp", style temp/cuda
+  // group is all because pressure is always global (group all),
+  // so its KE/temperature contribution should use group all too
+
+  const int ntemp = strlen(id) + 6;
+  id_temp = new char[ntemp];
+  strcpy(id_temp,id);
+  strcat(id_temp,"_temp");
+
+  char *temp_args[3];
+  temp_args[0] = id_temp;
+  temp_args[1] = (char *) "all";
+  temp_args[2] = (char *) "temp/cuda";
+  modify->add_compute(3,temp_args);
+  tflag = 1;
+
+  // pressure compute: id = fix-ID + "_press", group all, style pressure/cuda
+  // id_temp is handed over as the 4th argument of the pressure compute
+
+  const int npress = strlen(id) + 7;
+  id_press = new char[npress];
+  strcpy(id_press,id);
+  strcat(id_press,"_press");
+
+  char *press_args[4];
+  press_args[0] = id_press;
+  press_args[1] = (char *) "all";
+  press_args[2] = (char *) "pressure/cuda";
+  press_args[3] = id_temp;
+  modify->add_compute(4,press_args);
+  pflag = 1;
+}
--- a/src/USER-CUDA/fix_npt_cuda.h
+++ b/src/USER-CUDA/fix_npt_cuda.h
@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(npt/cuda,FixNPTCuda)
+
+#else
+
+#ifndef LMP_FIX_NPTCuda_H
+#define LMP_FIX_NPTCuda_H
+
+#include "fix_nh_cuda.h"
+
+namespace LAMMPS_NS {
+
+// Fix style "npt/cuda": FixNHCuda with both temperature and pressure control;
+// the constructor (in the .cpp) creates the temp/cuda and pressure/cuda computes.
+class FixNPTCuda : public FixNHCuda {
+ public:
+  FixNPTCuda(class LAMMPS *, int, char **);
+  ~FixNPTCuda() {}
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_nve_cuda.cpp
+++ b/src/USER-CUDA/fix_nve_cuda.cpp
@ -0,0 +1,155 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstdio>
+#include <cstring>
+#include "fix_nve_cuda.h"
+#include "fix_nve_cuda_cu.h"
+#include "atom.h"
+#include "force.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixNVECuda::FixNVECuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // the argument count is only checked when the style is not nve/sphere
+
+  const bool is_sphere = (strcmp(style,"nve/sphere") == 0);
+  if (!is_sphere && narg < 3)
+    error->all("Illegal fix nve command");
+
+  time_integrate = 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixNVECuda::setmask()
+{
+  // the respa variants stay disabled:
+  //   INITIAL_INTEGRATE_RESPA_CUDA, FINAL_INTEGRATE_RESPA_CUDA
+
+  int flags = 0;
+  flags |= INITIAL_INTEGRATE_CUDA;
+  flags |= FINAL_INTEGRATE_CUDA;
+  return flags;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNVECuda::init()
+{
+  // timestep factors for the position (dtv) and velocity (dtf) updates
+  const double dt = update->dt;
+  dtv = dt;
+  dtf = 0.5 * dt * force->ftm2v;
+
+  if (strcmp(update->integrate_style,"respa") == 0) {
+    Respa *respa = (Respa *) update->integrate;
+    step_respa = respa->step;
+  }
+
+  // remember the current trigger value so initial_integrate()
+  // can detect when it changes
+  triggerneighsq = cuda->shared_data.atom.triggerneighsq;
+  cuda->neighbor_decide_by_integrator = 1;
+
+  Cuda_FixNVECuda_Init(&cuda->shared_data,dtv,dtf);
+}
+
+/* ----------------------------------------------------------------------
+   allow for both per-type and per-atom mass
+------------------------------------------------------------------------- */
+
+void FixNVECuda::initial_integrate(int vflag)
+{
+  // rerun the CUDA-side init whenever the shared trigger value
+  // differs from the copy cached in this fix
+
+  if (cuda->shared_data.atom.triggerneighsq != triggerneighsq) {
+    triggerneighsq = cuda->shared_data.atom.triggerneighsq;
+    Cuda_FixNVECuda_Init(&cuda->shared_data,dtv,dtf);
+  }
+
+  // if this fix acts on the "first" group only its atoms are processed
+
+  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal;
+
+  Cuda_FixNVECuda_InitialIntegrate(&cuda->shared_data,groupbit,nlocal);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNVECuda::final_integrate()
+{
+  // if this fix acts on the "first" group only its atoms are processed
+
+  const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal;
+
+  Cuda_FixNVECuda_FinalIntegrate(&cuda->shared_data,groupbit,nlocal);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNVECuda::initial_integrate_respa(int vflag, int ilevel, int flag)
+{
+  // this point should not be reached yet since RESPA is not supported
+
+  if (flag) return;             // only used by NPT,NPH
+
+  const double step = step_respa[ilevel];
+  dtv = step;
+  dtf = 0.5 * step * force->ftm2v;
+
+  // innermost level - NVE update of v and x
+  // all other levels - NVE update of v
+
+  if (ilevel != 0) final_integrate();
+  else initial_integrate(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNVECuda::final_integrate_respa(int ilevel, int iloop)
+{
+  // this point should not be reached yet since RESPA is not supported
+
+  const double step = step_respa[ilevel];
+  dtf = 0.5 * step * force->ftm2v;
+
+  final_integrate();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixNVECuda::reset_dt()
+{
+  // recompute both timestep factors and pass them to the CUDA-side init
+  const double dt = update->dt;
+  dtv = dt;
+  dtf = 0.5 * dt * force->ftm2v;
+
+  Cuda_FixNVECuda_Init(&cuda->shared_data,dtv,dtf);
+}
--- a/src/USER-CUDA/fix_nve_cuda.h
+++ b/src/USER-CUDA/fix_nve_cuda.h
@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(nve/cuda,FixNVECuda)
+
+#else
+
+#ifndef LMP_FIX_NVE_CUDA_H
+#define LMP_FIX_NVE_CUDA_H
+
+#include "fix.h"
+#include "cuda_precision.h"
+
+namespace LAMMPS_NS {
+
+// Fix style "nve/cuda": declares the integration hooks whose bodies
+// (in the .cpp) forward to the Cuda_FixNVECuda_* library calls.
+class FixNVECuda : public Fix
+{
+	public:
+		FixNVECuda(class LAMMPS *, int, char **);
+		int setmask();
+		virtual void init();
+		virtual void initial_integrate(int);
+		virtual void final_integrate();
+		void initial_integrate_respa(int, int, int);
+		void final_integrate_respa(int, int);
+		void reset_dt();
+	
+		// cached copy of cuda->shared_data.atom.triggerneighsq,
+		// compared against the live value in initial_integrate()
+		X_FLOAT triggerneighsq;
+		
+	protected:
+		class Cuda *cuda;
+		// dtv = update->dt, dtf = 0.5 * dt * force->ftm2v (set in init/reset_dt)
+		double dtv, dtf;
+		// only assigned in init() when the integrate style is respa
+		double *step_respa;
+		// NOTE(review): not referenced anywhere in the .cpp -- possibly unused
+		int mass_require;
+		
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_nvt_cuda.cpp
+++ b/src/USER-CUDA/fix_nvt_cuda.cpp
@ -0,0 +1,48 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include "fix_nvt_cuda.h"
+#include "group.h"
+#include "modify.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixNVTCuda::FixNVTCuda(LAMMPS *lmp, int narg, char **arg) :
+  FixNHCuda(lmp, narg, arg)
+{
+  // nvt needs a thermostat and must not have a barostat
+
+  if (!tstat_flag)
+    error->all("Temperature control must be used with fix nvt");
+  if (pstat_flag)
+    error->all("Pressure control can not be used with fix nvt");
+
+  // temperature compute: id = fix-ID + "_temp", style temp/cuda,
+  // defined on the same group as this fix
+
+  const int ntemp = strlen(id) + 6;
+  id_temp = new char[ntemp];
+  strcpy(id_temp,id);
+  strcat(id_temp,"_temp");
+
+  char *temp_args[3];
+  temp_args[0] = id_temp;
+  temp_args[1] = group->names[igroup];
+  temp_args[2] = (char *) "temp/cuda";
+  modify->add_compute(3,temp_args);
+  tflag = 1;
+}
--- a/src/USER-CUDA/fix_nvt_cuda.h
+++ b/src/USER-CUDA/fix_nvt_cuda.h
@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(nvt/cuda,FixNVTCuda)
+
+#else
+
+#ifndef LMP_FIX_NVTCuda_H
+#define LMP_FIX_NVTCuda_H
+
+#include "fix_nh_cuda.h"
+
+namespace LAMMPS_NS {
+
+// Fix style "nvt/cuda": FixNHCuda with temperature control only;
+// the constructor (in the .cpp) creates the temp/cuda compute.
+class FixNVTCuda : public FixNHCuda {
+ public:
+  FixNVTCuda(class LAMMPS *, int, char **);
+  ~FixNVTCuda() {}
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_set_force_cuda.cpp
+++ b/src/USER-CUDA/fix_set_force_cuda.cpp
@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#include <cstring>
+#include <cstdlib>
+#include "fix_set_force_cuda.h"
+#include "fix_set_force_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda.h"
+#include "memory.h"
+#include "cuda_modify_flags.h"
+
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   parse the three force arguments arg[3..5]
+   the string "NULL" leaves that component untouched (flag = 0),
+   anything else is converted with atof() and used as the new value
+------------------------------------------------------------------------- */
+
+FixSetForceCuda::FixSetForceCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  
+  if (narg != 6) error->all("Illegal fix setforce/cuda command");
+
+  // this fix exports a global 3-vector (see compute_vector)
+
+  vector_flag = 1;
+  size_vector = 3;
+  global_freq = 1;
+  extvector = 1;
+
+  // give every component a defined value: post_force() passes all three
+  // by value even when the matching flag is 0, so a "NULL" argument must
+  // not leave its value uninitialized
+
+  xvalue = yvalue = zvalue = 0.0;
+
+  flagx = flagy = flagz = 1;
+  if (strcmp(arg[3],"NULL") == 0) flagx = 0;
+  else xvalue = atof(arg[3]);
+  if (strcmp(arg[4],"NULL") == 0) flagy = 0;
+  else yvalue = atof(arg[4]);
+  if (strcmp(arg[5],"NULL") == 0) flagz = 0;
+  else zvalue = atof(arg[5]);
+
+  // device mirror of foriginal is created lazily in init()
+
+  force_flag = 0;
+  foriginal[0] = foriginal[1] = foriginal[2] = 0.0;
+  cu_foriginal=NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixSetForceCuda::setmask()
+{
+  // collect every flag this fix sets
+  int flags = 0;
+
+  flags |= POST_FORCE_CUDA;
+  flags |= THERMO_ENERGY_CUDA;
+  flags |= POST_FORCE_RESPA;
+  flags |= MIN_POST_FORCE_CUDA;
+
+  return flags;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixSetForceCuda::init()
+{
+  // create the cCudaData wrapper around foriginal once, on the first init()
+  if (cu_foriginal == NULL) {
+    cu_foriginal = new cCudaData<double, F_FLOAT, x> (foriginal,3);
+  }
+
+  if (strcmp(update->integrate_style,"respa") == 0) {
+    Respa *respa = (Respa *) update->integrate;
+    nlevels_respa = respa->nlevels;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixSetForceCuda::setup(int vflag)
+{
+  MYDBG( printf("# CUDA: FixSetForceCuda::setup\n"); )
+
+  const bool verlet = (strcmp(update->integrate_style,"verlet") == 0);
+
+  if (!verlet) {
+    // every non-verlet integrate style takes the respa path:
+    // apply the fix at the outermost level
+    Respa *respa = (Respa *) update->integrate;
+    const int outer = nlevels_respa-1;
+
+    respa->copy_flevel_f(outer);
+    cuda->cu_f->download();
+    post_force_respa(vflag,outer,0);
+    cuda->cu_f->upload();
+    respa->copy_f_flevel(outer);
+  } else {
+    Cuda_FixSetForceCuda_Init(&cuda->shared_data);
+    cuda->cu_f->upload();
+    post_force(vflag);
+    cuda->cu_f->download();
+  }
+
+  MYDBG( printf("# CUDA: FixSetForceCuda::setup done\n"); )
+}
+
+/* ---------------------------------------------------------------------- */
+
+// setup hook used during minimization: simply applies post_force()
+void FixSetForceCuda::min_setup(int vflag)
+{
+  post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixSetForceCuda::post_force(int vflag)
+{
+  MYDBG( printf("# CUDA: FixSetForceCuda::postforce start\n"); )
+
+  // the summed force is stale from here on, compute_vector() redoes the sum
+  force_flag = 0;
+
+  // clear the device buffer before the library call fills it
+  cu_foriginal->memset_device(0);
+
+  F_FLOAT *dev_foriginal = (F_FLOAT*) cu_foriginal->dev_data();
+  Cuda_FixSetForceCuda_PostForce(&cuda->shared_data, groupbit,
+                                 xvalue, yvalue, zvalue,
+                                 dev_foriginal,
+                                 flagx, flagy, flagz);
+
+  // cu_foriginal was created around the host array foriginal in init()
+  cu_foriginal->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixSetForceCuda::post_force_respa(int vflag, int ilevel, int iloop)
+{
+  // outermost level: use the regular CUDA post_force()
+
+  if (ilevel == nlevels_respa-1) {
+    post_force(vflag);
+    return;
+  }
+
+  // all other levels are processed on the host:
+  // fetch forces and masks, zero the flagged components, send forces back
+
+  cuda->cu_f->download();
+  cuda->cu_mask->download();
+
+  double **f = atom->f;
+  int *mask = atom->mask;
+  const int nlocal = atom->nlocal;
+
+  foriginal[0] = foriginal[1] = foriginal[2] = 0.0;
+  force_flag = 0;
+
+  for (int i = 0; i < nlocal; i++) {
+    if (!(mask[i] & groupbit)) continue;
+
+    // accumulate the force before it is overwritten
+    foriginal[0] += f[i][0];
+    foriginal[1] += f[i][1];
+    foriginal[2] += f[i][2];
+
+    if (flagx) f[i][0] = 0.0;
+    if (flagy) f[i][1] = 0.0;
+    if (flagz) f[i][2] = 0.0;
+  }
+
+  cuda->cu_f->upload();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// post-force hook used during minimization: simply applies post_force()
+void FixSetForceCuda::min_post_force(int vflag)
+{
+  post_force(vflag);
+}
+
+
+/* ----------------------------------------------------------------------
+   return components of total force on fix group before force was changed
+------------------------------------------------------------------------- */
+
+double FixSetForceCuda::compute_vector(int n)
+{
+  // foriginal holds this proc's 3 force components
+  // only sum across procs one time, force_flag is reset to 0
+  // whenever post_force()/post_force_respa() refresh foriginal
+
+  if (force_flag == 0) {
+    MPI_Allreduce(foriginal,foriginal_all,3,MPI_DOUBLE,MPI_SUM,world);
+    force_flag = 1;
+  }
+
+  // size_vector is 3 and foriginal_all has exactly 3 entries,
+  // so component n lives at index n
+  // (index n+1 returned the wrong component and read past the end for n = 2)
+
+  return foriginal_all[n];
+}
--- a/src/USER-CUDA/fix_set_force_cuda.h
+++ b/src/USER-CUDA/fix_set_force_cuda.h
@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(setforce/cuda,FixSetForceCuda)
+
+#else
+
+#ifndef LMP_FIX_SET_FORCE_CUDA_H
+#define LMP_FIX_SET_FORCE_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// Fix style "setforce/cuda": overwrites selected force components of the
+// fix group and reports the force that was replaced via compute_vector().
+class FixSetForceCuda : public Fix {
+ public:
+  FixSetForceCuda(class LAMMPS *, int, char **);
+  // release the buffer allocated in init(); it was never freed before
+  // (the constructor sets cu_foriginal to NULL, so deleting it is always safe)
+  ~FixSetForceCuda() { delete cu_foriginal; }
+  int setmask();
+  void init();
+  void setup(int);
+  void min_setup(int);
+  void post_force(int);
+  void post_force_respa(int, int, int);
+  void min_post_force(int);
+  double compute_vector(int);
+
+ private:
+  class Cuda *cuda;
+  // 0 if the matching fix argument was "NULL" (component left alone), else 1
+  int flagx,flagy,flagz; 
+  // force values parsed from arg[3..5]
+  double xvalue,yvalue,zvalue;
+  // foriginal is summed over all procs into foriginal_all by compute_vector()
+  double foriginal[3],foriginal_all[3];
+  // wrapper created around foriginal in init(), passed to the CUDA post-force call
+  cCudaData<double     , F_FLOAT   		, x>* cu_foriginal;	
+  // 1 once compute_vector() has done the cross-proc sum, reset by post_force
+  int force_flag;
+  // number of respa levels, only set in init() for the respa integrate style
+  int nlevels_respa;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_shake_cuda.cpp
+++ b/src/USER-CUDA/fix_shake_cuda.cpp
--- a/src/USER-CUDA/fix_shake_cuda.h
+++ b/src/USER-CUDA/fix_shake_cuda.h
@ -0,0 +1,133 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(shake/cuda,FixShakeCuda)
+
+#else
+
+#ifndef LMP_FIX_SHAKE_CUDA_H
+#define LMP_FIX_SHAKE_CUDA_H
+
+#include "fix.h"
+#include "cuda_data.h"
+#include "cuda_precision.h"
+
+namespace LAMMPS_NS {
+
+// Fix registered under the style name "shake/cuda" (see the FixStyle entry
+// earlier in this header).  Declaration only; member order is significant
+// and must match the implementation file.
+class FixShakeCuda : public Fix {
+ public:
+  FixShakeCuda(class LAMMPS *, int, char **);
+  ~FixShakeCuda();
+
+  // fix hooks (a post_force_respa hook is not declared in this class)
+  int setmask();
+  void init();
+  void setup(int);
+  void pre_neighbor();
+  void post_force(int);
+
+  // per-atom storage management and communication
+  double memory_usage();
+  void grow_arrays(int);
+  void copy_arrays(int, int);
+  void set_arrays(int);
+  int pack_exchange(int, double *);
+  int unpack_exchange(int, double *);
+  int pack_comm(int, int *, double *, int, int *);
+  void unpack_comm(int, int, double *);
+
+  int dof(int);
+  void reset_dt();
+
+  double time_postforce;
+
+ private:
+  class Cuda *cuda;
+  int me;
+  int nprocs;
+  double PI;
+  double tolerance;           // SHAKE tolerance
+  int max_iter;               // max # of SHAKE iterations
+  int output_every;           // SHAKE stat output every so often
+  int next_output;            // timestep for next output
+
+  // ---- settings from the input command ----
+  int *bond_flag;             // bond types to constrain
+  int *angle_flag;            // angle types to constrain
+  int *type_flag;             // constrain bonds to these types
+  double *mass_list;          // constrain bonds to these masses
+  int nmass;                  // # of masses in mass_list
+  bool neighbor_step;         // was neighboring done in this step
+                              //   -> need to run the Cuda_FixShake_Init
+
+  // ---- constraint distances and their cCudaData wrappers ----
+  double *bond_distance;
+  double *angle_distance;
+  cCudaData<double, X_FLOAT, xx> *cu_bond_distance;
+  cCudaData<double, X_FLOAT, xx> *cu_angle_distance;
+
+  int ifix_respa;             // rRESPA fix needed by SHAKE
+  int nlevels_respa;          // copies of needed rRESPA variables
+  int *loop_respa;
+  double *step_respa;
+
+  // ---- local ptrs to atom class quantities ----
+  double **x;
+  double **v;
+  double **f;
+  double *mass;
+  double *rmass;
+  int *type;
+  int nlocal;
+
+  // ---- atom-based arrays ----
+  int *shake_flag;            // 0 if atom not in SHAKE cluster
+                              // 1 = size 3 angle cluster
+                              // 2,3,4 = size of bond-only cluster
+  int **shake_atom;           // global IDs of atoms in cluster
+                              // central atom is 1st
+                              // lowest global ID is 1st for size 2
+  int **shake_type;           // bondtype of each bond in cluster
+                              // for angle cluster, 3rd value is angletype
+  double **xshake;            // unconstrained atom coords
+
+  // ---- cCudaData wrappers ----
+  cCudaData<int, int, xx> *cu_shake_flag;
+  cCudaData<int, int, yx> *cu_shake_atom;
+  cCudaData<int, int, yx> *cu_shake_type;
+  cCudaData<double, X_FLOAT, xy> *cu_xshake;
+  cCudaData<int, int, xx> *cu_list;
+  cCudaData<double, ENERGY_FLOAT, xx> *cu_virial;
+  int *countoccur;
+
+  int vflag;                  // virial flag
+  double dtv;                 // timesteps for trial move
+  double dtfsq;
+  double dtf_inner;           // timesteps for rRESPA trial move
+  double dtf_innerhalf;
+
+  int *list;                  // list of clusters to SHAKE
+  int nlist;                  // size of list
+  int maxlist;                // max-size of list
+
+  // ---- stat quantities ----
+  int *b_count;               // counts for each bond type
+  int *b_count_all;
+  double *b_ave;              // ave/max/min dist for each bond type
+  double *b_max;
+  double *b_min;
+  double *b_ave_all;          // MPI summing arrays
+  double *b_max_all;
+  double *b_min_all;
+  int *a_count;               // ditto for angle types
+  int *a_count_all;
+  double *a_ave;
+  double *a_max;
+  double *a_min;
+  double *a_ave_all;
+  double *a_max_all;
+  double *a_min_all;
+
+  // ---- internal helpers ----
+  void find_clusters();
+  void swap_clusters(int, int);
+  int masscheck(double);
+  void unconstrained_update();
+  void shake2(int);
+  void shake3(int);
+  void shake4(int);
+  void shake3angle(int);
+  void stats();
+  int bondfind(int, int, int);
+  int anglefind(int, int, int);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_temp_berendsen_cuda.cpp
+++ b/src/USER-CUDA/fix_temp_berendsen_cuda.cpp
@ -0,0 +1,220 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+#include "fix_temp_berendsen_cuda.h"
+#include "fix_temp_berendsen_cuda_cu.h"
+#include "atom.h"
+#include "force.h"
+#include "group.h"
+#include "update.h"
+#include "comm.h"
+#include "modify.h"
+#include "compute.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+enum{NOBIAS,BIAS};
+
+/* ---------------------------------------------------------------------- */
+
+// Construct the fix.  Exactly 6 arguments are accepted; arg[3], arg[4] and
+// arg[5] are parsed with atof() into t_start, t_stop and t_period.
+// A companion "temp/cuda" compute named "<fix-ID>_temp" is created on the
+// fix group, and tflag records that this fix owns it.
+FixTempBerendsenCuda::FixTempBerendsenCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  if (narg != 6)
+    error->all("Illegal fix temp/berendsen/cuda command");
+
+  // the thermostat is applied on every timestep
+  nevery = 1;
+
+  t_start  = atof(arg[3]);
+  t_stop   = atof(arg[4]);
+  t_period = atof(arg[5]);
+
+  if (t_period <= 0.0)
+    error->all("Fix temp/berendsen/cuda period must be > 0.0");
+
+  // compute ID = fix ID + "_temp": 5 characters plus the terminating NUL
+  const int idlen = strlen(id) + 6;
+  id_temp = new char[idlen];
+  strcpy(id_temp,id);
+  strcat(id_temp,"_temp");
+
+  // argument vector for the new compute: ID, group name, style
+  char *compute_arg[3];
+  compute_arg[0] = id_temp;
+  compute_arg[1] = group->names[igroup];
+  compute_arg[2] = (char *) "temp/cuda";
+  modify->add_compute(3,compute_arg);
+
+  tflag = 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Release the compute ID string; the compute itself is removed only while
+// tflag says this fix created it.
+FixTempBerendsenCuda::~FixTempBerendsenCuda()
+{
+  if (tflag) {
+    modify->delete_compute(id_temp);
+  }
+
+  delete [] id_temp;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Report the hooks this fix uses: only END_OF_STEP_CUDA.
+int FixTempBerendsenCuda::setmask()
+{
+  return END_OF_STEP_CUDA;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Resolve the temperature compute named by id_temp and cache whether it
+// carries a velocity bias (which = BIAS) or not (which = NOBIAS).
+// Aborts via error->all() when the compute ID is unknown.
+void FixTempBerendsenCuda::init()
+{
+  int icompute = modify->find_compute(id_temp);
+  if (icompute < 0)
+    error->all("Temperature ID for fix temp/berendsen/cuda does not exist");
+  temperature = modify->compute[icompute];
+
+  // emit the warning from rank 0 only, matching the rank guard on the
+  // group-mismatch warning in modify_param(); unguarded, every MPI rank
+  // would issue its own copy of the message
+  if (!temperature->cudable && comm->me == 0)
+    error->warning("Fix temp/berendsen/cuda uses non cudable temperature compute");
+
+  which = temperature->tempbias ? BIAS : NOBIAS;
+
+  // NOTE: temperature->init() is deliberately not called here; the original
+  // authors left it disabled and questioned whether it is required.
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Rescale the velocities of the fix group by the Berendsen factor
+//   lamda = sqrt(1 + dt/t_period * (t_target/t_current - 1)),
+// where t_target is ramped linearly from t_start to t_stop over the run.
+// Aborts via error->all() when the computed temperature is exactly 0.0.
+void FixTempBerendsenCuda::end_of_step()
+{
+  // compute is not CUDA-enabled: download cu_x / cu_v before using it
+  // (presumably this refreshes the host arrays the compute reads)
+  if (!temperature->cudable) {
+    cuda->cu_x->download();
+    cuda->cu_v->download();
+  }
+
+  double t_current = temperature->compute_scalar();
+  if (t_current == 0.0)
+    error->all("Computed temperature for fix temp/berendsen/cuda cannot be 0.0");
+
+  // fraction of the run completed; skip the division when the numerator is
+  // zero so that endstep == beginstep cannot produce 0/0 = NaN
+  double delta = update->ntimestep - update->beginstep;
+  if (delta != 0.0) delta /= update->endstep - update->beginstep;
+  t_target = t_start + delta * (t_stop-t_start);
+
+  // rescale velocities by lamda
+
+  double lamda = sqrt(1.0 + update->dt/t_period*(t_target/t_current - 1.0));
+
+  double **v = atom->v;
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+
+  if (which == NOBIAS) {
+    Cuda_FixTempBerendsenCuda_EndOfStep(&cuda->shared_data, groupbit,lamda);
+  } else if (!temperature->cudable) {
+    // biased, non-CUDA compute: strip the bias per atom, scale, restore,
+    // all on the v array obtained from atom->v, then upload cu_v
+    cuda->cu_x->download();
+    cuda->cu_v->download();
+    for (int i = 0; i < nlocal; i++) {
+      if (mask[i] & groupbit) {
+        temperature->remove_bias(i,v[i]);
+        v[i][0] *= lamda;
+        v[i][1] *= lamda;
+        v[i][2] *= lamda;
+        temperature->restore_bias(i,v[i]);
+      }
+    }
+    cuda->cu_v->upload();
+  } else {
+    // biased, CUDA-enabled compute: bias handling brackets the scaling call
+    temperature->remove_bias_all();
+    Cuda_FixTempBerendsenCuda_EndOfStep(&cuda->shared_data, groupbit,lamda);
+    temperature->restore_bias_all();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Handle "fix_modify ... temp <compute-ID>": switch this fix to a
+// user-supplied temperature compute.
+// Returns 2 (arguments consumed) for the "temp" keyword, 0 for any other
+// keyword.  Aborts via error->all() on a missing ID argument, an unknown
+// compute, or a compute whose tempflag is 0.
+int FixTempBerendsenCuda::modify_param(int narg, char **arg)
+{
+  if (strcmp(arg[0],"temp") != 0) return 0;
+
+  if (narg < 2) error->all("Illegal fix_modify command");
+
+  // drop the compute created by the constructor, if it is still owned
+  if (tflag) {
+    modify->delete_compute(id_temp);
+    tflag = 0;
+  }
+
+  // replace the stored ID with a copy of the new one
+  delete [] id_temp;
+  const int idlen = strlen(arg[1]) + 1;
+  id_temp = new char[idlen];
+  strcpy(id_temp,arg[1]);
+
+  const int icompute = modify->find_compute(id_temp);
+  if (icompute < 0) error->all("Could not find fix_modify temperature ID");
+  temperature = modify->compute[icompute];
+
+  if (temperature->tempflag == 0)
+    error->all("Fix_modify temperature ID does not compute temperature");
+  if (temperature->igroup != igroup && comm->me == 0)
+    error->warning("Group for fix_modify temp != fix group");
+
+  return 2;
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+// Set both ends of the temperature ramp to t_new.
+void FixTempBerendsenCuda::reset_target(double t_new)
+{
+  t_stop = t_new;
+  t_start = t_new;
+}
+
+
+
--- a/src/USER-CUDA/fix_temp_berendsen_cuda.h
+++ b/src/USER-CUDA/fix_temp_berendsen_cuda.h
@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+#ifdef FIX_CLASS
+
+FixStyle(temp/berendsen/cuda,FixTempBerendsenCuda)
+
+#else
+
+#ifndef LMP_FIX_TEMP_BERENDSEN_CUDA_H
+#define LMP_FIX_TEMP_BERENDSEN_CUDA_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+// Fix registered under the style name "temp/berendsen/cuda" (see the
+// FixStyle entry earlier in this header).
+class FixTempBerendsenCuda : public Fix {
+ public:
+  FixTempBerendsenCuda(class LAMMPS *, int, char **);
+  ~FixTempBerendsenCuda();
+
+  int setmask();
+  void init();
+  void end_of_step();
+  int modify_param(int, char **);
+  void reset_target(double);
+
+ private:
+  class Cuda *cuda;
+  int which;                   // NOBIAS or BIAS, chosen in init()
+
+  double t_start;              // ramp start temperature
+  double t_stop;               // ramp end temperature
+  double t_target;             // target computed in end_of_step()
+  double t_period;             // must be > 0.0 (checked in the constructor)
+
+  char *id_temp;               // ID of the temperature compute
+  class Compute *temperature;  // compute looked up from id_temp
+  int tflag;                   // 1 while the fix-created compute is in use
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_temp_rescale_cuda.cpp
+++ b/src/USER-CUDA/fix_temp_rescale_cuda.cpp
@ -0,0 +1,222 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+#include "fix_temp_rescale_cuda.h"
+#include "fix_temp_rescale_cuda_cu.h"
+#include "atom.h"
+#include "force.h"
+#include "group.h"
+#include "update.h"
+#include "domain.h"
+#include "region.h"
+#include "comm.h"
+#include "modify.h"
+#include "compute.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+
+enum{NOBIAS,BIAS};
+
+/* ---------------------------------------------------------------------- */
+
+// Construct the fix.  At least 8 arguments are required: arg[3] is parsed
+// with atoi() into nevery (must be > 0) and arg[4..7] with atof() into
+// t_start, t_stop, t_window and fraction.
+// A companion "temp/cuda" compute named "<fix-ID>_temp" is created on the
+// fix group, and the accumulated energy starts at 0.0.
+FixTempRescaleCuda::FixTempRescaleCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  if (narg < 8)
+    error->all("Illegal fix temp/rescale/cuda command");
+
+  nevery = atoi(arg[3]);
+  if (nevery <= 0)
+    error->all("Illegal fix temp/rescale/cuda command");
+
+  // this fix exposes a global scalar (see compute_scalar())
+  scalar_flag = 1;
+  global_freq = nevery;
+  extscalar = 1;
+
+  t_start  = atof(arg[4]);
+  t_stop   = atof(arg[5]);
+  t_window = atof(arg[6]);
+  fraction = atof(arg[7]);
+
+  // compute ID = fix ID + "_temp": 5 characters plus the terminating NUL
+  const int idlen = strlen(id) + 6;
+  id_temp = new char[idlen];
+  strcpy(id_temp,id);
+  strcat(id_temp,"_temp");
+
+  // argument vector for the new compute: ID, group name, style
+  char *compute_arg[3];
+  compute_arg[0] = id_temp;
+  compute_arg[1] = group->names[igroup];
+  compute_arg[2] = (char *) "temp/cuda";
+  modify->add_compute(3,compute_arg);
+
+  tflag = 1;
+  energy = 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Release the compute ID string; the compute itself is removed only while
+// tflag says this fix created it.
+FixTempRescaleCuda::~FixTempRescaleCuda()
+{
+  if (tflag) {
+    modify->delete_compute(id_temp);
+  }
+
+  delete [] id_temp;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Report the hooks this fix uses: END_OF_STEP_CUDA and THERMO_ENERGY_CUDA.
+int FixTempRescaleCuda::setmask()
+{
+  return END_OF_STEP_CUDA | THERMO_ENERGY_CUDA;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Resolve the temperature compute named by id_temp and cache whether it
+// carries a velocity bias (which = BIAS) or not (which = NOBIAS).
+// Aborts via error->all() when the compute ID is unknown.
+void FixTempRescaleCuda::init()
+{
+  int icompute = modify->find_compute(id_temp);
+  if (icompute < 0)
+    error->all("Temperature ID for fix temp/rescale/cuda does not exist");
+  temperature = modify->compute[icompute];
+
+  // emit the warning from rank 0 only, matching the rank guard on the
+  // group-mismatch warning in modify_param(); unguarded, every MPI rank
+  // would issue its own copy of the message
+  if (!temperature->cudable && comm->me == 0)
+    error->warning("Fix temp/rescale/cuda uses non cudable temperature compute");
+
+  which = temperature->tempbias ? BIAS : NOBIAS;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// When the current temperature lies more than t_window away from the
+// ramped target, move it a fraction of the way back by scaling velocities
+// of the fix group with sqrt(t_new/t_current), and add the corresponding
+// energy change to the running total returned by compute_scalar().
+// Aborts via error->all() when the computed temperature is exactly 0.0.
+void FixTempRescaleCuda::end_of_step()
+{
+  // compute is not CUDA-enabled: download cu_x / cu_v before using it
+  // (presumably this refreshes the host arrays the compute reads)
+  if (!temperature->cudable) {
+    cuda->cu_x->download();
+    cuda->cu_v->download();
+  }
+
+  double t_current = temperature->compute_scalar();
+  if (t_current == 0.0)
+    error->all("Computed temperature for fix temp/rescale/cuda cannot be 0.0");
+
+  // fraction of the run completed; skip the division when the numerator is
+  // zero so that endstep == beginstep cannot produce 0/0 = NaN
+  double delta = update->ntimestep - update->beginstep;
+  if (delta != 0.0) delta /= update->endstep - update->beginstep;
+  double t_target = t_start + delta * (t_stop-t_start);
+
+  // rescale velocity of appropriate atoms if outside window
+
+  if (fabs(t_current-t_target) > t_window) {
+    t_target = t_current - fraction*(t_current-t_target);
+    double factor = sqrt(t_target/t_current);
+
+    // local value; named so that it does not shadow the class member
+    // 'efactor', which this function leaves untouched
+    double energy_per_temp = 0.5 * force->boltz * temperature->dof;
+
+    double **v = atom->v;
+    int *mask = atom->mask;
+    int nlocal = atom->nlocal;
+
+    if (which == NOBIAS) {
+      energy += (t_current-t_target) * energy_per_temp;
+      Cuda_FixTempRescaleCuda_EndOfStep(&cuda->shared_data, groupbit,factor);
+
+    } else if (which == BIAS) {
+      energy += (t_current-t_target) * energy_per_temp;
+
+      if (!temperature->cudable) {
+        // biased, non-CUDA compute: strip the bias per atom, scale,
+        // restore, all on the v array from atom->v, then upload cu_v
+        cuda->cu_x->download();
+        cuda->cu_v->download();
+        for (int i = 0; i < nlocal; i++) {
+          if (mask[i] & groupbit) {
+            temperature->remove_bias(i,v[i]);
+            v[i][0] *= factor;
+            v[i][1] *= factor;
+            v[i][2] *= factor;
+            temperature->restore_bias(i,v[i]);
+          }
+        }
+        cuda->cu_v->upload();
+      } else {
+        // biased, CUDA-enabled compute: bias handling brackets the call
+        temperature->remove_bias_all();
+        Cuda_FixTempRescaleCuda_EndOfStep(&cuda->shared_data, groupbit,factor);
+        temperature->restore_bias_all();
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Handle "fix_modify ... temp <compute-ID>": switch this fix to a
+// user-supplied temperature compute.
+// Returns 2 (arguments consumed) for the "temp" keyword, 0 for any other
+// keyword.  Aborts via error->all() on a missing ID argument, an unknown
+// compute, or a compute whose tempflag is 0.
+int FixTempRescaleCuda::modify_param(int narg, char **arg)
+{
+  if (strcmp(arg[0],"temp") != 0) return 0;
+
+  if (narg < 2) error->all("Illegal fix_modify command");
+
+  // drop the compute created by the constructor, if it is still owned
+  if (tflag) {
+    modify->delete_compute(id_temp);
+    tflag = 0;
+  }
+
+  // replace the stored ID with a copy of the new one
+  delete [] id_temp;
+  int n = strlen(arg[1]) + 1;
+  id_temp = new char[n];
+  strcpy(id_temp,arg[1]);
+
+  int icompute = modify->find_compute(id_temp);
+  if (icompute < 0) error->all("Could not find fix_modify temperature ID");
+  temperature = modify->compute[icompute];
+
+  if (temperature->tempflag == 0)
+    error->all("Fix_modify temperature ID does not compute temperature");
+
+  // both warnings are issued from rank 0 only; the cudable warning used to
+  // be unguarded and was therefore issued by every MPI rank
+  if (comm->me == 0) {
+    if (temperature->igroup != igroup)
+      error->warning("Group for fix_modify temp != fix group");
+    if (!temperature->cudable)
+      error->warning("Fix temp/rescale/cuda uses non cudable temperature compute");
+  }
+
+  return 2;
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+// Set both ends of the temperature ramp to t_new.
+void FixTempRescaleCuda::reset_target(double t_new)
+{
+  t_stop = t_new;
+  t_start = t_new;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Return the energy accumulated by end_of_step() since construction.
+double FixTempRescaleCuda::compute_scalar()
+{
+  const double accumulated = energy;
+  return accumulated;
+}
--- a/src/USER-CUDA/fix_temp_rescale_cuda.h
+++ b/src/USER-CUDA/fix_temp_rescale_cuda.h
@ -0,0 +1,61 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(temp/rescale/cuda,FixTempRescaleCuda)
+
+#else
+
+#ifndef FIX_TEMP_RESCALE_CUDA_H
+#define FIX_TEMP_RESCALE_CUDA_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+// Fix registered under the style name "temp/rescale/cuda" (see the
+// FixStyle entry earlier in this header).
+class FixTempRescaleCuda : public Fix {
+ public:
+  FixTempRescaleCuda(class LAMMPS *, int, char **);
+  ~FixTempRescaleCuda();
+
+  int setmask();
+  void init();
+  void end_of_step();
+  int modify_param(int, char **);
+  void reset_target(double);
+  double compute_scalar();
+
+ private:
+  class Cuda *cuda;
+  int which;                   // NOBIAS or BIAS, chosen in init()
+
+  double t_start;              // ramp start temperature
+  double t_stop;               // ramp end temperature
+  double t_window;             // rescale only when |T - target| exceeds this
+  double fraction;             // fraction of the gap removed per rescale
+  double energy;               // running total returned by compute_scalar()
+  double efactor;
+
+  char *id_temp;               // ID of the temperature compute
+  class Compute *temperature;  // compute looked up from id_temp
+  int tflag;                   // 1 while the fix-created compute is in use
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp
+++ b/src/USER-CUDA/fix_temp_rescale_limit_cuda.cpp
@ -0,0 +1,237 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include <cstdlib>
+#include <cmath>
+#include "fix_temp_rescale_limit_cuda.h"
+#include "fix_temp_rescale_limit_cuda_cu.h"
+#include "atom.h"
+#include "force.h"
+#include "group.h"
+#include "update.h"
+#include "domain.h"
+#include "region.h"
+#include "comm.h"
+#include "modify.h"
+#include "compute.h"
+#include "error.h"
+#include "cuda.h"
+#include "cuda_modify_flags.h"
+
+using namespace LAMMPS_NS;
+// Two-argument minimum / maximum.  The whole expansion is parenthesized so
+// it behaves as one operand inside a larger expression; without the outer
+// parentheses "2 * MIN(a,b)" would parse as "(2 * (a < b)) ? a : b".
+// Each argument is still evaluated twice, so avoid side effects in them.
+#define MIN(A,B) (((A) < (B)) ? (A) : (B))
+#define MAX(A,B) (((A) > (B)) ? (A) : (B))
+
+enum{NOBIAS,BIAS};
+
+/* ---------------------------------------------------------------------- */
+
+// Construct the fix.  At least 9 arguments are required: arg[3] is parsed
+// with atoi() into nevery (must be > 0), arg[4..7] with atof() into
+// t_start, t_stop, t_window and fraction, and arg[8] into limit (must be
+// > 1.0).  A companion "temp/cuda" compute named "<fix-ID>_temp" is created
+// on the fix group, and the accumulated energy starts at 0.0.
+FixTempRescaleLimitCuda::FixTempRescaleLimitCuda(LAMMPS *lmp, int narg, char **arg) :
+  Fix(lmp, narg, arg)
+{
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  if (narg < 9)
+    error->all("Illegal fix temp/rescale/limit/cuda command");
+
+  nevery = atoi(arg[3]);
+  if (nevery <= 0)
+    error->all("Illegal fix temp/rescale/limit/cuda command");
+
+  // this fix exposes a global scalar (see compute_scalar())
+  scalar_flag = 1;
+  global_freq = nevery;
+  extscalar = 1;
+
+  t_start  = atof(arg[4]);
+  t_stop   = atof(arg[5]);
+  t_window = atof(arg[6]);
+  fraction = atof(arg[7]);
+  limit    = atof(arg[8]);
+  if (limit <= 1.0)
+    error->all("Illegal fix temp/rescale/limit/cuda command (limit must be > 1.0)");
+
+  // compute ID = fix ID + "_temp": 5 characters plus the terminating NUL
+  const int idlen = strlen(id) + 6;
+  id_temp = new char[idlen];
+  strcpy(id_temp,id);
+  strcat(id_temp,"_temp");
+
+  // argument vector for the new compute: ID, group name, style
+  char *compute_arg[3];
+  compute_arg[0] = id_temp;
+  compute_arg[1] = group->names[igroup];
+  compute_arg[2] = (char *) "temp/cuda";
+  modify->add_compute(3,compute_arg);
+
+  tflag = 1;
+  energy = 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Release the compute ID string; the compute itself is removed only while
+// tflag says this fix created it.
+FixTempRescaleLimitCuda::~FixTempRescaleLimitCuda()
+{
+  if (tflag) {
+    modify->delete_compute(id_temp);
+  }
+
+  delete [] id_temp;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Report the hooks this fix uses: END_OF_STEP_CUDA and THERMO_ENERGY_CUDA.
+int FixTempRescaleLimitCuda::setmask()
+{
+  return END_OF_STEP_CUDA | THERMO_ENERGY_CUDA;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Resolve the temperature compute named by id_temp and cache whether it
+// carries a velocity bias (which = BIAS) or not (which = NOBIAS).
+// Aborts via error->all() when the compute ID is unknown.
+void FixTempRescaleLimitCuda::init()
+{
+  int icompute = modify->find_compute(id_temp);
+  if (icompute < 0)
+    error->all("Temperature ID for fix temp/rescale/limit/cuda does not exist");
+  temperature = modify->compute[icompute];
+
+  // emit the warning from rank 0 only, matching the rank guard on the
+  // group-mismatch warning in modify_param(); unguarded, every MPI rank
+  // would issue its own copy of the message
+  if (!temperature->cudable && comm->me == 0)
+    error->warning("Fix temp/rescale/limit/cuda uses non cudable temperature compute");
+
+  which = temperature->tempbias ? BIAS : NOBIAS;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Same rescaling rule as fix temp/rescale/cuda, with an additional cap:
+// after scaling, each velocity component is clamped to
+// [-current_limit, +current_limit].  The energy change is added to the
+// running total returned by compute_scalar().
+// Aborts via error->all() when the computed temperature is exactly 0.0.
+void FixTempRescaleLimitCuda::end_of_step()
+{
+  // compute is not CUDA-enabled: download cu_x / cu_v before using it
+  // (presumably this refreshes the host arrays the compute reads)
+  if (!temperature->cudable) {
+    cuda->cu_x->download();
+    cuda->cu_v->download();
+  }
+
+  double t_current = temperature->compute_scalar();
+  if (t_current == 0.0)
+    error->all("Computed temperature for fix temp/rescale/limit/cuda cannot be 0.0");
+
+  // fraction of the run completed; skip the division when the numerator is
+  // zero so that endstep == beginstep cannot produce 0/0 = NaN
+  double delta = update->ntimestep - update->beginstep;
+  if (delta != 0.0) delta /= update->endstep - update->beginstep;
+  double t_target = t_start + delta * (t_stop-t_start);
+
+  // rescale velocity of appropriate atoms if outside window
+
+  if (fabs(t_current-t_target) > t_window) {
+    t_target = t_current - fraction*(t_current-t_target);
+    double factor = sqrt(t_target/t_current);
+
+    // local value; named so that it does not shadow the class member
+    // 'efactor', which this function leaves untouched
+    double energy_per_temp = 0.5 * force->boltz * temperature->dof;
+
+    double **v = atom->v;
+    int *mask = atom->mask;
+    int nlocal = atom->nlocal;
+
+    // NOTE(review): element 0 of rmass/mass is used as the one
+    // representative mass for every atom.  Confirm that index 0 is a valid
+    // entry (per-type mass arrays are presumably 1-based, and rmass[0]
+    // presumes at least one local atom).
+    double massone;
+    if (atom->rmass) massone = atom->rmass[0];
+    else massone = atom->mass[0];
+
+    // largest magnitude allowed for a velocity component after rescaling
+    double current_limit=sqrt(limit*force->boltz*t_target*temperature->dof/massone/force->mvv2e);
+
+    if (which == NOBIAS) {
+      energy += (t_current-t_target) * energy_per_temp;
+      Cuda_FixTempRescaleLimitCuda_EndOfStep(&cuda->shared_data, groupbit,factor,current_limit);
+
+    } else if (which == BIAS) {
+      energy += (t_current-t_target) * energy_per_temp;
+
+      if (!temperature->cudable) {
+        // biased, non-CUDA compute: strip the bias per atom, scale and
+        // clamp, restore, all on the v array from atom->v, then upload cu_v
+        cuda->cu_x->download();
+        cuda->cu_v->download();
+        for (int i = 0; i < nlocal; i++) {
+          if (mask[i] & groupbit) {
+            temperature->remove_bias(i,v[i]);
+            double vx = v[i][0] * factor;
+            double vy = v[i][1] * factor;
+            double vz = v[i][2] * factor;
+            v[i][0]=vx>0?MIN(vx,current_limit):MAX(vx,-current_limit);
+            v[i][1]=vy>0?MIN(vy,current_limit):MAX(vy,-current_limit);
+            v[i][2]=vz>0?MIN(vz,current_limit):MAX(vz,-current_limit);
+            temperature->restore_bias(i,v[i]);
+          }
+        }
+        cuda->cu_v->upload();
+      } else {
+        // biased, CUDA-enabled compute: bias handling brackets the call
+        temperature->remove_bias_all();
+        Cuda_FixTempRescaleLimitCuda_EndOfStep(&cuda->shared_data, groupbit,factor,current_limit);
+        temperature->restore_bias_all();
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Handle "fix_modify ... temp <compute-ID>": switch this fix to a
+// user-supplied temperature compute.
+// Returns 2 (arguments consumed) for the "temp" keyword, 0 for any other
+// keyword.  Aborts via error->all() on a missing ID argument, an unknown
+// compute, or a compute whose tempflag is 0.
+int FixTempRescaleLimitCuda::modify_param(int narg, char **arg)
+{
+  if (strcmp(arg[0],"temp") != 0) return 0;
+
+  if (narg < 2) error->all("Illegal fix_modify command");
+
+  // drop the compute created by the constructor, if it is still owned
+  if (tflag) {
+    modify->delete_compute(id_temp);
+    tflag = 0;
+  }
+
+  // replace the stored ID with a copy of the new one
+  delete [] id_temp;
+  int n = strlen(arg[1]) + 1;
+  id_temp = new char[n];
+  strcpy(id_temp,arg[1]);
+
+  int icompute = modify->find_compute(id_temp);
+  if (icompute < 0) error->all("Could not find fix_modify temperature ID");
+  temperature = modify->compute[icompute];
+
+  if (temperature->tempflag == 0)
+    error->all("Fix_modify temperature ID does not compute temperature");
+
+  // both warnings are issued from rank 0 only; the cudable warning used to
+  // be unguarded and was therefore issued by every MPI rank
+  if (comm->me == 0) {
+    if (temperature->igroup != igroup)
+      error->warning("Group for fix_modify temp != fix group");
+    if (!temperature->cudable)
+      error->warning("Fix temp/rescale/limit/cuda uses non cudable temperature compute");
+  }
+
+  return 2;
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+// Set both ends of the temperature ramp to t_new.
+void FixTempRescaleLimitCuda::reset_target(double t_new)
+{
+  t_stop = t_new;
+  t_start = t_new;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Return the energy accumulated by end_of_step() since construction.
+double FixTempRescaleLimitCuda::compute_scalar()
+{
+  const double accumulated = energy;
+  return accumulated;
+}
--- a/src/USER-CUDA/fix_temp_rescale_limit_cuda.h
+++ b/src/USER-CUDA/fix_temp_rescale_limit_cuda.h
@ -0,0 +1,61 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(temp/rescale/limit/cuda,FixTempRescaleLimitCuda)
+
+#else
+
+#ifndef FIX_TEMP_RESCALE_LIMIT_CUDA_H
+#define FIX_TEMP_RESCALE_LIMIT_CUDA_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+// Fix registered under the style name "temp/rescale/limit/cuda" (see the
+// FixStyle entry earlier in this header).
+class FixTempRescaleLimitCuda : public Fix {
+ public:
+  FixTempRescaleLimitCuda(class LAMMPS *, int, char **);
+  ~FixTempRescaleLimitCuda();
+
+  int setmask();
+  void init();
+  void end_of_step();
+  int modify_param(int, char **);
+  void reset_target(double);
+  double compute_scalar();
+
+ private:
+  class Cuda *cuda;
+  int which;                   // NOBIAS or BIAS, chosen in init()
+
+  double t_start;              // ramp start temperature
+  double t_stop;               // ramp end temperature
+  double t_window;             // rescale only when |T - target| exceeds this
+  double fraction;             // fraction of the gap removed per rescale
+  double energy;               // running total returned by compute_scalar()
+  double efactor;
+  double limit;                // must be > 1.0 (checked in the constructor)
+
+  char *id_temp;               // ID of the temperature compute
+  class Compute *temperature;  // compute looked up from id_temp
+  int tflag;                   // 1 while the fix-created compute is in use
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/fix_viscous_cuda.cpp
+++ b/src/USER-CUDA/fix_viscous_cuda.cpp
@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include "fix_viscous_cuda.h"
+#include "fix_viscous_cuda_cu.h"
+#include "atom.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include "cuda_modify_flags.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+FixViscousCuda::FixViscousCuda(LAMMPS *lmp, int narg, char **arg) :
+  FixViscous(lmp, narg, arg)
+{
+  // every /cuda style needs the Cuda object held by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (!cuda) {
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  }
+
+  // the wrapper around gamma is created on demand in setup()
+  cu_gamma = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixViscousCuda::~FixViscousCuda()
+{
+  // cu_gamma is either NULL (never set up) or the object created in setup();
+  // deleting a null pointer is a no-op, so no guard is needed
+  delete cu_gamma;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixViscousCuda::setmask()
+{
+  // only the CUDA post-force hook is requested; the respa and minimizer
+  // hooks below remain disabled
+  int mask = POST_FORCE_CUDA;
+  // mask |= POST_FORCE_RESPA;
+  // mask |= MIN_POST_FORCE;
+  return mask;
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+void FixViscousCuda::setup(int vflag)
+{
+  // create the wrapper around the gamma array (ntypes+1 entries) on first use
+  if (cu_gamma == NULL) {
+    cu_gamma = new cCudaData<double, F_FLOAT, x> (gamma,atom->ntypes+1);
+  }
+
+  Cuda_FixViscousCuda_Init(&cuda->shared_data);
+  cu_gamma->upload();
+
+  // the respa branch is disabled, so post_force() is always called directly
+  // if (strcmp(update->integrate_style,"verlet/cuda") == 0)
+  post_force(vflag);
+  /* else {
+    ((Respa *) update->integrate)->copy_flevel_f(nlevels_respa-1);
+    post_force_respa(vflag,nlevels_respa-1,0);
+    ((Respa *) update->integrate)->copy_f_flevel(nlevels_respa-1);
+  }*/
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixViscousCuda::min_setup(int vflag)
+{
+  // post_force() dereferences cu_gamma, and nothing in this class guarantees
+  // that setup() ran first. Create the wrapper around gamma here as well, so
+  // a minimization that starts with min_setup() does not hit a NULL pointer.
+  if (cu_gamma == NULL)
+    cu_gamma = new cCudaData<double, F_FLOAT, x> (gamma,atom->ntypes+1);
+
+  Cuda_FixViscousCuda_Init(&cuda->shared_data);
+
+  // same sequence as setup(): upload gamma before it is used by post_force()
+  cu_gamma->upload();
+  post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixViscousCuda::post_force(int vflag)
+{
+  // apply drag force to atoms in group
+  // direction is opposed to velocity vector
+  // magnitude depends on atom type
+  // (vflag is not used here; the work is delegated to the CUDA library call)
+
+  Cuda_FixViscousCuda_PostForce(&cuda->shared_data,
+                                groupbit,
+                                cu_gamma->dev_data());
+}
--- a/src/USER-CUDA/fix_viscous_cuda.h
+++ b/src/USER-CUDA/fix_viscous_cuda.h
@ -0,0 +1,55 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(viscous/cuda,FixViscousCuda)
+
+#else
+
+#ifndef LMP_FIX_VISCOUS_CUDA_H
+#define LMP_FIX_VISCOUS_CUDA_H
+
+#include "fix_viscous.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+class FixViscousCuda : public FixViscous {
+ public:
+  FixViscousCuda(class LAMMPS *, int, char **);
+  ~FixViscousCuda();
+
+  int setmask();
+  void setup(int);
+  void min_setup(int);
+  void post_force(int);
+
+  // wrapper around the gamma array; NULL until setup() creates it,
+  // released in the destructor
+  cCudaData<double, F_FLOAT, x>* cu_gamma;
+
+ private:
+  // copied from LAMMPS::cuda in the constructor
+  class Cuda *cuda;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/modify_cuda.cpp
+++ b/src/USER-CUDA/modify_cuda.cpp
@ -63,6 +63,8 @@ using namespace LAMMPS_NS;
 ModifyCuda::ModifyCuda(LAMMPS *lmp) : Modify(lmp)
 {
  cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");

  n_initial_integrate_cuda = 0;
  n_post_integrate_cuda = 0;
--- a/src/USER-CUDA/neigh_full_cuda.cpp
+++ b/src/USER-CUDA/neigh_full_cuda.cpp
@ -21,7 +21,6 @@
   This software is distributed under the GNU General Public License.
 ------------------------------------------------------------------------- */

-#ifdef CUDA
 #include "neighbor_cuda.h"
 #include "neigh_list.h"
 #include "atom.h"
@ -313,5 +312,4 @@ return;
  MYDBG(printf(" # CUDA::NeighFullNSQCuda ... end\n");)
  */
 }
-#endif

--- a/src/USER-CUDA/neighbor_cuda.cpp
+++ b/src/USER-CUDA/neighbor_cuda.cpp
@ -36,6 +36,8 @@ enum{NSQ,BIN,MULTI};     // also in neigh_list.cpp
 NeighborCuda::NeighborCuda(LAMMPS *lmp) : Neighbor(lmp)
 {
  cuda = lmp->cuda;
+   if(cuda == NULL)
+        error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
 }

 /* ---------------------------------------------------------------------- */
--- a/src/USER-CUDA/pair_born_coul_long_cuda.cpp
+++ b/src/USER-CUDA/pair_born_coul_long_cuda.cpp
@ -0,0 +1,186 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_born_coul_long_cuda.h"
+#include "pair_born_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+/* ---------------------------------------------------------------------- */
+
+PairBornCoulLongCuda::PairBornCoulLongCuda(LAMMPS *lmp) : PairBornCoulLong(lmp)
+{
+  // every /cuda style needs the Cuda object held by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // init_list() assigns this only for list id 0; give it a defined value
+  // instead of leaving the pointer uninitialized until then
+  cuda_neigh_list = NULL;
+
+  // allocate() has not yet published the coefficient arrays
+  allocated2 = false;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairBornCoulLongCuda::allocate()
+{
+  // let the parent class allocate its arrays first
+  if (!allocated) PairBornCoulLong::allocate();
+
+  // the pointers below only need to be recorded once
+  if (allocated2) return;
+  allocated2 = true;
+
+  cuda->shared_data.pair.cut          = cut_lj;
+  cuda->shared_data.pair.coeff1       = rhoinv;
+  cuda->shared_data.pair.coeff2       = sigma;
+  cuda->shared_data.pair.coeff3       = a;
+  cuda->shared_data.pair.coeff4       = c;
+  cuda->shared_data.pair.coeff5       = d;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBornCoulLongCuda::compute(int eflag, int vflag)
+{
+  MYDBG( printf("PairBornCoulLongCuda compute start\n"); fflush(stdout);)
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  // upload energy and virial accumulators before the force call
+  if (eflag) {
+    cuda->cu_eng_vdwl->upload();
+    cuda->cu_eng_coul->upload();
+  }
+  if (vflag) cuda->cu_virial->upload();
+
+  #ifdef CUDA_USE_BINNING
+  Cuda_PairBornCoulLongCuda(& cuda->shared_data, eflag, vflag);
+  #else
+  Cuda_PairBornCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+  #endif
+
+  // download the accumulators unless collect_forces_later is set
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) {
+      cuda->cu_eng_vdwl->download();
+      cuda->cu_eng_coul->download();
+    }
+    if (vflag) cuda->cu_virial->download();
+  }
+  MYDBG( printf("PairBornCoulLongCuda compute end\n"); fflush(stdout);)
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBornCoulLongCuda::settings(int narg, char **arg)
+{
+  // the parent class processes the arguments; afterwards the global LJ
+  // cutoff is mirrored into the shared CUDA data as F_FLOAT
+  PairBornCoulLong::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBornCoulLongCuda::coeff(int narg, char **arg)
+{
+  // coefficients are handled by the parent class; allocate() then records
+  // the array pointers in the shared CUDA data (a no-op after the first call)
+  PairBornCoulLong::coeff(narg, arg);
+  this->allocate();
+}
+
+void PairBornCoulLongCuda::init_style()
+{
+  if (!atom->q_flag)
+    error->all("Pair style born/coul/long requires atom attribute q");
+
+  // rRESPA is rejected; the message now names this style (it previously
+  // said buck/coul/long/cuda, copied from the Buckingham variant)
+  if (strcmp(update->integrate_style,"respa") == 0)
+    error->all("Integrate Style Respa is not supported by pair style born/coul/long/cuda");
+
+  // request a full neighbor list flagged as cudable
+  int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  cut_coulsq = cut_coul * cut_coul;
+  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
+
+  // g_ewald is read from the KSpace object, so one must be defined
+  if (force->kspace == NULL)
+    error->all("Pair style is incompatible with KSpace style");
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald=g_ewald;
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+
+  if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+void PairBornCoulLongCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairBornCoulLongCuda::init_list\n");)
+
+  // the parent class handles the host-side list first
+  PairBornCoulLong::init_list(id, ptr);
+
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  // see Neighbor::init() for details on lammps lists' logic
+  if (0 == id) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+  #endif
+
+  MYDBG(printf("# CUDA PairBornCoulLongCuda::init_list end\n");)
+}
+
+void PairBornCoulLongCuda::ev_setup(int eflag, int vflag)
+{
+  // remember both per-atom capacities before the parent call may change them
+  int maxeatomold=maxeatom;
+  int maxvatomold=maxvatom;
+  PairBornCoulLong::ev_setup(eflag,vflag);
+
+  // per-atom energy: re-wrap eatom when atom->nmax exceeds the old capacity
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // per-atom virial: this branch used to test eflag_atom and the eatom
+  // capacity, and bound cu_vatom to the eatom descriptor. It has to follow
+  // vflag_atom, the vatom capacity and the vatom descriptor instead.
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
--- a/src/USER-CUDA/pair_born_coul_long_cuda.h
+++ b/src/USER-CUDA/pair_born_coul_long_cuda.h
@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(born/coul/long/cuda,PairBornCoulLongCuda)
+
+#else
+
+#ifndef LMP_PAIR_BORN_COUL_LONG_CUDA_H
+#define LMP_PAIR_BORN_COUL_LONG_CUDA_H
+
+#include "pair_born_coul_long.h"
+
+namespace LAMMPS_NS {
+
+class PairBornCoulLongCuda : public PairBornCoulLong
+{
+ public:
+  PairBornCoulLongCuda(class LAMMPS *);
+
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  // copied from LAMMPS::cuda in the constructor
+  class Cuda *cuda;
+
+  void allocate();
+
+  // true once allocate() has recorded the coefficient arrays in the shared CUDA data
+  bool allocated2;
+
+  // assigned in init_list() for list id 0
+  class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_buck_coul_cut_cuda.cpp
+++ b/src/USER-CUDA/pair_buck_coul_cut_cuda.cpp
@ -0,0 +1,173 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_buck_coul_cut_cuda.h"
+#include "pair_buck_coul_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairBuckCoulCutCuda::PairBuckCoulCutCuda(LAMMPS *lmp) : PairBuckCoulCut(lmp)
+{
+  // every /cuda style needs the Cuda object held by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // init_list() assigns this only for list id 0; give it a defined value
+  // instead of leaving the pointer uninitialized until then
+  cuda_neigh_list = NULL;
+
+  // allocate() has not yet published the coefficient arrays
+  allocated2 = false;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairBuckCoulCutCuda::allocate()
+{
+  // let the parent class allocate its arrays first
+  if (!allocated) PairBuckCoulCut::allocate();
+
+  // the pointers below only need to be recorded once
+  if (allocated2) return;
+  allocated2 = true;
+
+  cuda->shared_data.pair.cut_coul     = cut_coul;
+  cuda->shared_data.pair.cut          = cut_lj;
+  cuda->shared_data.pair.coeff1       = rhoinv;
+  cuda->shared_data.pair.coeff2       = buck1;
+  cuda->shared_data.pair.coeff3       = buck2;
+  cuda->shared_data.pair.coeff4       = a;
+  cuda->shared_data.pair.coeff5       = c;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckCoulCutCuda::compute(int eflag, int vflag)
+{
+  MYDBG( printf("PairBuckCoulCutCuda compute start\n"); fflush(stdout);)
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  // upload energy and virial accumulators before the force call
+  if (eflag) {
+    cuda->cu_eng_vdwl->upload();
+    cuda->cu_eng_coul->upload();
+  }
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairBuckCoulCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  // download the accumulators unless collect_forces_later is set
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) {
+      cuda->cu_eng_vdwl->download();
+      cuda->cu_eng_coul->download();
+    }
+    if (vflag) cuda->cu_virial->download();
+  }
+  MYDBG( printf("PairBuckCoulCutCuda compute end\n"); fflush(stdout);)
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckCoulCutCuda::settings(int narg, char **arg)
+{
+  // the parent class processes the arguments; afterwards both global
+  // cutoffs are mirrored into the shared CUDA data as F_FLOAT
+  PairBuckCoulCut::settings(narg, arg);
+  cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckCoulCutCuda::coeff(int narg, char **arg)
+{
+  // coefficients are handled by the parent class; allocate() then records
+  // the array pointers in the shared CUDA data (a no-op after the first call)
+  PairBuckCoulCut::coeff(narg, arg);
+  this->allocate();
+}
+
+void PairBuckCoulCutCuda::init_style()
+{
+  // both messages below used to name buck/coul/long (copied from the
+  // long-range variant); they now name this style
+  if (!atom->q_flag)
+    error->all("Pair style buck/coul/cut requires atom attribute q");
+
+  if (strcmp(update->integrate_style,"respa") == 0)
+    error->all("Integrate Style Respa is not supported by pair style buck/coul/cut/cuda");
+
+  // request a full neighbor list flagged as cudable
+  int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+
+  cuda->shared_data.pair.cut_coulsq_global=cut_coul_global * cut_coul_global;
+
+  if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+void PairBuckCoulCutCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairBuckCoulCutCuda::init_list\n");)
+
+  // the parent class handles the host-side list first
+  PairBuckCoulCut::init_list(id, ptr);
+
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  // see Neighbor::init() for details on lammps lists' logic
+  if (0 == id) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+  #endif
+
+  MYDBG(printf("# CUDA PairBuckCoulCutCuda::init_list end\n");)
+}
+
+void PairBuckCoulCutCuda::ev_setup(int eflag, int vflag)
+{
+  // remember both per-atom capacities before the parent call may change them
+  int maxeatomold=maxeatom;
+  int maxvatomold=maxvatom;
+  PairBuckCoulCut::ev_setup(eflag,vflag);
+
+  // per-atom energy: re-wrap eatom when atom->nmax exceeds the old capacity
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // per-atom virial: this branch used to test eflag_atom and the eatom
+  // capacity, and bound cu_vatom to the eatom descriptor. It has to follow
+  // vflag_atom, the vatom capacity and the vatom descriptor instead.
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
--- a/src/USER-CUDA/pair_buck_coul_cut_cuda.h
+++ b/src/USER-CUDA/pair_buck_coul_cut_cuda.h
@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/coul/cut/cuda,PairBuckCoulCutCuda)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_COUL_CUT_CUDA_H
+#define LMP_PAIR_BUCK_COUL_CUT_CUDA_H
+
+#include "pair_buck_coul_cut.h"
+
+namespace LAMMPS_NS {
+
+class PairBuckCoulCutCuda : public PairBuckCoulCut
+{
+ public:
+  PairBuckCoulCutCuda(class LAMMPS *);
+
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  // copied from LAMMPS::cuda in the constructor
+  class Cuda *cuda;
+
+  void allocate();
+
+  // true once allocate() has recorded the coefficient arrays in the shared CUDA data
+  bool allocated2;
+
+  // assigned in init_list() for list id 0
+  class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_buck_coul_long_cuda.cpp
+++ b/src/USER-CUDA/pair_buck_coul_long_cuda.cpp
@ -0,0 +1,184 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_buck_coul_long_cuda.h"
+#include "pair_buck_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+/* ---------------------------------------------------------------------- */
+
+PairBuckCoulLongCuda::PairBuckCoulLongCuda(LAMMPS *lmp) : PairBuckCoulLong(lmp)
+{
+  // every /cuda style needs the Cuda object held by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // init_list() assigns this only for list id 0; give it a defined value
+  // instead of leaving the pointer uninitialized until then
+  cuda_neigh_list = NULL;
+
+  // allocate() has not yet published the coefficient arrays
+  allocated2 = false;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairBuckCoulLongCuda::allocate()
+{
+  // let the parent class allocate its arrays first
+  if (!allocated) PairBuckCoulLong::allocate();
+
+  // the pointers below only need to be recorded once
+  if (allocated2) return;
+  allocated2 = true;
+
+  cuda->shared_data.pair.cut          = cut_lj;
+  cuda->shared_data.pair.coeff1       = rhoinv;
+  cuda->shared_data.pair.coeff2       = buck1;
+  cuda->shared_data.pair.coeff3       = buck2;
+  cuda->shared_data.pair.coeff4       = a;
+  cuda->shared_data.pair.coeff5       = c;
+  cuda->shared_data.pair.offset       = offset;
+  cuda->shared_data.pair.special_lj   = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckCoulLongCuda::compute(int eflag, int vflag)
+{
+  MYDBG( printf("PairBuckCoulLongCuda compute start\n"); fflush(stdout);)
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  // upload energy and virial accumulators before the force call
+  if (eflag) {
+    cuda->cu_eng_vdwl->upload();
+    cuda->cu_eng_coul->upload();
+  }
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairBuckCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  // download the accumulators unless collect_forces_later is set
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) {
+      cuda->cu_eng_vdwl->download();
+      cuda->cu_eng_coul->download();
+    }
+    if (vflag) cuda->cu_virial->download();
+  }
+  MYDBG( printf("PairBuckCoulLongCuda compute end\n"); fflush(stdout);)
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckCoulLongCuda::settings(int narg, char **arg)
+{
+  // the parent class processes the arguments; afterwards the global LJ
+  // cutoff is mirrored into the shared CUDA data as F_FLOAT
+  PairBuckCoulLong::settings(narg, arg);
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckCoulLongCuda::coeff(int narg, char **arg)
+{
+  // coefficients are handled by the parent class; allocate() then records
+  // the array pointers in the shared CUDA data (a no-op after the first call)
+  PairBuckCoulLong::coeff(narg, arg);
+  this->allocate();
+}
+
+void PairBuckCoulLongCuda::init_style()
+{
+  // charges are mandatory for this style
+  if (!atom->q_flag) {
+    error->all("Pair style buck/coul/long requires atom attribute q");
+  }
+
+  // rRESPA integration is rejected
+  if (!strcmp(update->integrate_style,"respa")) {
+    error->all("Integrate Style Respa is not supported by pair style buck/coul/long/cuda");
+  }
+
+  // request a full neighbor list flagged as cudable
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  cut_coulsq = cut_coul * cut_coul;
+  cuda->shared_data.pair.cut_coulsq_global = cut_coulsq;
+
+  // g_ewald is read from the KSpace object, so one must be defined
+  if (force->kspace == NULL) {
+    error->all("Pair style is incompatible with KSpace style");
+  }
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald = g_ewald;
+  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
+
+  if (ncoultablebits) {
+    error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+  }
+}
+
+void PairBuckCoulLongCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairBuckCoulLongCuda::init_list\n");)
+
+  // the parent class handles the host-side list first
+  PairBuckCoulLong::init_list(id, ptr);
+
+  #ifndef CUDA_USE_BINNING
+  // right now we can only handle verlet (id 0), not respa
+  // see Neighbor::init() for details on lammps lists' logic
+  if (0 == id) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+  #endif
+
+  MYDBG(printf("# CUDA PairBuckCoulLongCuda::init_list end\n");)
+}
+
+void PairBuckCoulLongCuda::ev_setup(int eflag, int vflag)
+{
+  // remember both per-atom capacities before the parent call may change them
+  int maxeatomold=maxeatom;
+  int maxvatomold=maxvatom;
+  PairBuckCoulLong::ev_setup(eflag,vflag);
+
+  // per-atom energy: re-wrap eatom when atom->nmax exceeds the old capacity
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // per-atom virial: this branch used to test eflag_atom and the eatom
+  // capacity, and bound cu_vatom to the eatom descriptor. It has to follow
+  // vflag_atom, the vatom capacity and the vatom descriptor instead.
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
--- a/src/USER-CUDA/pair_buck_coul_long_cuda.h
+++ b/src/USER-CUDA/pair_buck_coul_long_cuda.h
@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/coul/long/cuda,PairBuckCoulLongCuda)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_COUL_LONG_CUDA_H
+#define LMP_PAIR_BUCK_COUL_LONG_CUDA_H
+
+#include "pair_buck_coul_long.h"
+
+namespace LAMMPS_NS {
+
+class PairBuckCoulLongCuda : public PairBuckCoulLong
+{
+ public:
+  PairBuckCoulLongCuda(class LAMMPS *);
+
+  void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  void init_list(int, class NeighList *);
+  void init_style();
+  void ev_setup(int eflag, int vflag);
+
+ protected:
+  // copied from LAMMPS::cuda in the constructor
+  class Cuda *cuda;
+
+  void allocate();
+
+  // true once allocate() has recorded the coefficient arrays in the shared CUDA data
+  bool allocated2;
+
+  // assigned in init_list() for list id 0
+  class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_buck_cuda.cpp
+++ b/src/USER-CUDA/pair_buck_cuda.cpp
@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_buck_cuda.h"
+#include "pair_buck_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+/* construct the style on top of PairBuck and mark pair forces as cudable;
+   aborts through error->all() when lmp->cuda is NULL */
+PairBuckCuda::PairBuckCuda(LAMMPS *lmp) : PairBuck(lmp)
+{
+	cuda = lmp->cuda;
+	if(cuda == NULL)
+		error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+	// shared-data pointers are stored later, on the first allocate() call
+	allocated2 = false;
+	// compute() dereferences this pointer; give it a defined value until
+	// init_list() registers a list instead of leaving it uninitialized
+	cuda_neigh_list = NULL;
+
+	cuda->shared_data.pair.cudable_force = 1;
+	cuda->shared_data.pair.use_block_per_atom = 0;
+	cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairBuckCuda::allocate()
+{
+	// let the base class do its own allocation first (only once)
+	if(! allocated) PairBuck::allocate();
+
+	// the pointers below only need to be stored a single time
+	if(allocated2) return;
+	allocated2 = true;
+
+	// cutoff, offset and special-bond arrays
+	cuda->shared_data.pair.cut        = cut;
+	cuda->shared_data.pair.offset     = offset;
+	cuda->shared_data.pair.special_lj = force->special_lj;
+
+	// Buckingham coefficient arrays, in the slot order coeff1..coeff5
+	cuda->shared_data.pair.coeff1 = rhoinv;
+	cuda->shared_data.pair.coeff2 = buck1;
+	cuda->shared_data.pair.coeff3 = buck2;
+	cuda->shared_data.pair.coeff4 = a;
+	cuda->shared_data.pair.coeff5 = c;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckCuda::compute(int eflag, int vflag)
+{
+	MYDBG( printf("PairBuckCuda compute start\n"); fflush(stdout);)
+	if (eflag || vflag) ev_setup(eflag,vflag);
+
+	// push the requested accumulators before the force call
+	if (eflag) {
+		cuda->cu_eng_vdwl->upload();
+		cuda->cu_eng_coul->upload();
+	}
+	if (vflag) {
+		cuda->cu_virial->upload();
+	}
+
+	Cuda_PairBuckCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+	// fetch the results right away unless collection is deferred
+	if (!cuda->shared_data.pair.collect_forces_later) {
+		if (eflag) {
+			cuda->cu_eng_vdwl->download();
+			cuda->cu_eng_coul->download();
+		}
+		if (vflag) {
+			cuda->cu_virial->download();
+		}
+	}
+	MYDBG( printf("PairBuckCuda compute end\n"); fflush(stdout);)
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckCuda::settings(int narg, char **arg)
+{
+	// argument handling is left to the base class
+	PairBuck::settings(narg, arg);
+
+	// copy the resulting global cutoff into the shared pair data
+	cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairBuckCuda::coeff(int narg, char **arg)
+{
+	// base class handles the coefficient arguments
+	PairBuck::coeff(narg, arg);
+
+	// afterwards make sure the shared pair data points at the arrays
+	this->allocate();
+}
+
+/* ----------------------------------------------------------------------
+   style-specific initialization: sanity checks, neighbor list request
+------------------------------------------------------------------------- */
+
+void PairBuckCuda::init_style()
+{
+	// messages name this style (buck/cuda); they used to say buck/coul/long
+	// NOTE(review): this check came along with code copied from the
+	// buck/coul/long variant - confirm plain buck/cuda really needs q
+	if (!atom->q_flag)
+		error->all("Pair style buck/cuda requires atom attribute q");
+
+	// rRESPA integration is rejected outright
+	if (strcmp(update->integrate_style,"respa") == 0)
+		error->all("Integrate Style Respa is not supported by pair style buck/cuda");
+
+	// request a full (not half) neighbor list flagged as cudable
+	int irequest = neighbor->request(this);
+	neighbor->requests[irequest]->full = 1;
+	neighbor->requests[irequest]->half = 0;
+	neighbor->requests[irequest]->cudable = 1;
+
+	cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+
+	if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+void PairBuckCuda::init_list(int id, NeighList *ptr)
+{
+	MYDBG(printf("# CUDA PairBuckCuda::init_list\n");)
+	PairBuck::init_list(id, ptr);
+	#ifndef CUDA_USE_BINNING
+	// only the list with id 0 (verlet) is handled; respa lists are not
+	// (see Neighbor::init() for details on lammps lists' logic)
+	if (id == 0) {
+		cuda_neigh_list = cuda->registerNeighborList(ptr);
+	}
+	#endif
+	MYDBG(printf("# CUDA PairBuckCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base class ev_setup, then rebuild the cCudaData wrappers of the
+   per-atom energy / virial arrays when atom->nmax exceeds the capacity
+   recorded before the call
+------------------------------------------------------------------------- */
+
+void PairBuckCuda::ev_setup(int eflag, int vflag)
+{
+	// capacities as they were before the base class call
+	int maxeatomold=maxeatom;
+	int maxvatomold=maxvatom;
+	PairBuck::ev_setup(eflag,vflag);
+
+	if (eflag_atom && atom->nmax > maxeatomold)
+	{delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+
+	// the virial wrapper is keyed on vflag_atom / maxvatom and bound to
+	// atom.vatom (it used to test eflag_atom and alias atom.eatom)
+	if (vflag_atom && atom->nmax > maxvatomold)
+	{delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+
+}
--- a/src/USER-CUDA/pair_buck_cuda.h
+++ b/src/USER-CUDA/pair_buck_cuda.h
@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(buck/cuda,PairBuckCuda)
+
+#else
+
+#ifndef LMP_PAIR_BUCK_CUDA_H
+#define LMP_PAIR_BUCK_CUDA_H
+
+#include "pair_buck.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered as buck/cuda; extends PairBuck. Each overridden
+// entry point forwards to PairBuck where applicable and additionally
+// keeps the data reachable through the Cuda object in sync.
+class PairBuckCuda : public PairBuck
+{
+	public:
+		PairBuckCuda(class LAMMPS *);
+		// uploads energy/virial accumulators as requested, calls
+		// Cuda_PairBuckCuda, downloads unless collect_forces_later is set
+		void compute(int, int);
+		// forwards to PairBuck::settings, then stores cut_global in shared data
+		void settings(int, char **);
+		// forwards to PairBuck::coeff, then calls allocate()
+		void coeff(int, char **);
+		// forwards to PairBuck::init_list; registers list id 0 with the Cuda
+		// object unless CUDA_USE_BINNING is defined
+		void init_list(int, class NeighList *);
+		// requests a full, cudable neighbor list; errors out for respa
+		void init_style();
+		// forwards to PairBuck::ev_setup, then may recreate cu_eatom/cu_vatom
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// copied from lmp->cuda in the constructor
+		class Cuda *cuda;
+		// stores the coefficient array pointers in cuda->shared_data.pair
+		void allocate();
+		// true once allocate() has stored those pointers
+		bool allocated2;
+		// value returned by cuda->registerNeighborList() in init_list()
+		class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.cpp
+++ b/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.cpp
@ -0,0 +1,204 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_cg_cmm_coul_cut_cuda.h"
+#include "pair_cg_cmm_coul_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+/* construct the style on top of PairCGCMMCoulCut and mark pair forces as
+   cudable; aborts through error->all() when lmp->cuda is NULL */
+PairCGCMMCoulCutCuda::PairCGCMMCoulCutCuda(LAMMPS *lmp) : PairCGCMMCoulCut(lmp)
+{
+	cuda = lmp->cuda;
+	if(cuda == NULL)
+		error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+	// shared-data pointers are stored later, on the first allocate() call
+	allocated2 = false;
+	cg_type_double = NULL;
+	// compute() dereferences this pointer; give it a defined value until
+	// init_list() registers a list instead of leaving it uninitialized
+	cuda_neigh_list = NULL;
+
+	cuda->shared_data.pair.cudable_force = 1;
+	cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulCutCuda::allocate()
+{
+	// let the base class do its own allocation first (only once)
+	if(! allocated) PairCGCMMCoulCut::allocate();
+
+	const int ntypes = atom->ntypes;
+
+	// one-time part: create the double mirror and store all array pointers
+	if(! allocated2)
+	{
+		allocated2 = true;
+
+		memory->create(cg_type_double,ntypes+1,ntypes+1,"paircg:cgtypedouble");
+
+		cuda->shared_data.pair.cut          = cut_lj;
+		cuda->shared_data.pair.cut_coul     = cut_coul;
+		cuda->shared_data.pair.coeff1       = lj1;
+		cuda->shared_data.pair.coeff2       = lj2;
+		cuda->shared_data.pair.coeff3       = lj3;
+		cuda->shared_data.pair.coeff4       = lj4;
+		cuda->shared_data.pair.coeff5       = cg_type_double;
+		cuda->shared_data.pair.offset       = offset;
+		cuda->shared_data.pair.special_lj   = force->special_lj;
+		cuda->shared_data.pair.special_coul = force->special_coul;
+	}
+
+	// every call: mirror cg_type symmetrically into the double array
+	for (int row = 1; row <= ntypes; ++row)
+		for (int col = row; col <= ntypes; ++col)
+			cg_type_double[row][col] = cg_type_double[col][row] = cg_type[row][col];
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulCutCuda::compute(int eflag, int vflag)
+{
+	if (eflag || vflag) ev_setup(eflag,vflag);
+
+	// push the requested accumulators before the force call
+	if (eflag) {
+		cuda->cu_eng_vdwl->upload();
+		cuda->cu_eng_coul->upload();
+	}
+	if (vflag) {
+		cuda->cu_virial->upload();
+	}
+
+	Cuda_PairCGCMMCoulCutCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+	// fetch the results right away unless collection is deferred
+	if (!cuda->shared_data.pair.collect_forces_later) {
+		if (eflag) {
+			cuda->cu_eng_vdwl->download();
+			cuda->cu_eng_coul->download();
+		}
+		if (vflag) {
+			cuda->cu_virial->download();
+		}
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulCutCuda::settings(int narg, char **arg)
+{
+	// argument handling is left to the base class
+	PairCGCMMCoulCut::settings(narg, arg);
+
+	// copy the resulting global values into the shared pair data
+	cuda->shared_data.pair.cut_global      = static_cast<F_FLOAT>(cut_lj_global);
+	cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+	cuda->shared_data.pair.kappa           = static_cast<F_FLOAT>(kappa);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulCutCuda::coeff(int narg, char **arg)
+{
+	// base class handles the coefficient arguments
+	PairCGCMMCoulCut::coeff(narg, arg);
+
+	// afterwards refresh the shared pair data and the cg_type mirror
+	this->allocate();
+}
+
+void PairCGCMMCoulCutCuda::init_style()
+{
+	MYDBG(printf("# CUDA PairCGCMMCoulCutCuda::init_style start\n"); )
+
+	// no list is requested when whichflag is 0 and the integrator is respa
+	const bool skip_request =
+		(update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0);
+
+	if (!skip_request) {
+		// full (not half) neighbor list flagged as cudable
+		const int irequest = neighbor->request(this);
+		neighbor->requests[irequest]->full = 1;
+		neighbor->requests[irequest]->half = 0;
+		neighbor->requests[irequest]->cudable = 1;
+	}
+
+	cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+	cut_respa=NULL;
+
+	if (force->newton) error->warning("Pair style uses does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
+
+	MYDBG(printf("# CUDA PairCGCMMCoulCutCuda::init_style end\n"); )
+}
+
+void PairCGCMMCoulCutCuda::init_list(int id, NeighList *ptr)
+{
+	MYDBG(printf("# CUDA PairCGCMMCoulCutCuda::init_list\n");)
+	PairCGCMMCoulCut::init_list(id, ptr);
+	#ifndef CUDA_USE_BINNING
+	// only the list with id 0 (verlet) is handled; respa lists are not
+	// (see Neighbor::init() for details on lammps lists' logic)
+	if (id == 0) {
+		cuda_neigh_list = cuda->registerNeighborList(ptr);
+	}
+	#endif
+	MYDBG(printf("# CUDA PairCGCMMCoulCutCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base class ev_setup, then rebuild the cCudaData wrappers of the
+   per-atom energy / virial arrays when atom->nmax exceeds the capacity
+   recorded before the call
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulCutCuda::ev_setup(int eflag, int vflag)
+{
+	// capacities as they were before the base class call
+	int maxeatomold=maxeatom;
+	int maxvatomold=maxvatom;
+	PairCGCMMCoulCut::ev_setup(eflag,vflag);
+
+	if (eflag_atom && atom->nmax > maxeatomold)
+	{delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+
+	// the virial wrapper is keyed on vflag_atom / maxvatom and bound to
+	// atom.vatom (it used to test eflag_atom and alias atom.eatom)
+	if (vflag_atom && atom->nmax > maxvatomold)
+	{delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+
+}
+
+
--- a/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.h
+++ b/src/USER-CUDA/pair_cg_cmm_coul_cut_cuda.h
@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(cg/cmm/coul/cut/cuda,PairCGCMMCoulCutCuda)
+
+#else
+
+#ifndef PAIR_CG_CMM_COUL_CUT_CUDA_H
+#define PAIR_CG_CMM_COUL_CUT_CUDA_H
+
+#include "pair_cg_cmm_coul_cut.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered as cg/cmm/coul/cut/cuda; extends PairCGCMMCoulCut.
+// Each overridden entry point forwards to the base class where applicable
+// and additionally keeps the data reachable through the Cuda object in sync.
+class PairCGCMMCoulCutCuda : public PairCGCMMCoulCut
+{
+	public:
+		PairCGCMMCoulCutCuda(class LAMMPS *);
+		// uploads energy/virial accumulators as requested, calls
+		// Cuda_PairCGCMMCoulCutCuda, downloads unless collect_forces_later is set
+		void compute(int, int);
+		// forwards to the base class, then stores the global cutoffs and kappa
+		void settings(int, char **);
+		// forwards to the base class, then calls allocate()
+		void coeff(int, char **);
+		// forwards to the base class; registers list id 0 with the Cuda
+		// object unless CUDA_USE_BINNING is defined
+		void init_list(int, class NeighList *);
+		// requests a full, cudable neighbor list (skipped for respa setup)
+		void init_style();
+		// forwards to the base class, then may recreate cu_eatom/cu_vatom
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// copied from lmp->cuda in the constructor
+		class Cuda *cuda;
+		// stores array pointers in cuda->shared_data.pair and refreshes
+		// cg_type_double from cg_type
+		void allocate();
+		// true once allocate() has stored those pointers
+		bool allocated2;
+		// value returned by cuda->registerNeighborList() in init_list()
+		class CudaNeighList* cuda_neigh_list;
+		// double-valued copy of cg_type, created with memory->create() in
+		// allocate() and handed to shared_data.pair.coeff5
+		// NOTE(review): no destructor is declared here, so this array does
+		// not appear to be released by this class - confirm
+		double** cg_type_double;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.cpp
+++ b/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.cpp
@ -0,0 +1,204 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_cg_cmm_coul_debye_cuda.h"
+#include "pair_cg_cmm_coul_debye_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+/* construct the style on top of PairCGCMMCoulCut and mark pair forces as
+   cudable; aborts through error->all() when lmp->cuda is NULL */
+PairCGCMMCoulDebyeCuda::PairCGCMMCoulDebyeCuda(LAMMPS *lmp) : PairCGCMMCoulCut(lmp)
+{
+	cuda = lmp->cuda;
+	if(cuda == NULL)
+		error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+	// shared-data pointers are stored later, on the first allocate() call
+	allocated2 = false;
+	cg_type_double = NULL;
+	// compute() dereferences this pointer; give it a defined value until
+	// init_list() registers a list instead of leaving it uninitialized
+	cuda_neigh_list = NULL;
+
+	cuda->shared_data.pair.cudable_force = 1;
+	cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulDebyeCuda::allocate()
+{
+	// let the base class do its own allocation first (only once)
+	if(! allocated) PairCGCMMCoulCut::allocate();
+
+	const int ntypes = atom->ntypes;
+
+	// one-time part: create the double mirror and store all array pointers
+	if(! allocated2)
+	{
+		allocated2 = true;
+
+		memory->create(cg_type_double,ntypes+1,ntypes+1,"paircg:cgtypedouble");
+
+		cuda->shared_data.pair.cut          = cut_lj;
+		cuda->shared_data.pair.cut_coul     = cut_coul;
+		cuda->shared_data.pair.coeff1       = lj1;
+		cuda->shared_data.pair.coeff2       = lj2;
+		cuda->shared_data.pair.coeff3       = lj3;
+		cuda->shared_data.pair.coeff4       = lj4;
+		cuda->shared_data.pair.coeff5       = cg_type_double;
+		cuda->shared_data.pair.offset       = offset;
+		cuda->shared_data.pair.special_lj   = force->special_lj;
+		cuda->shared_data.pair.special_coul = force->special_coul;
+	}
+
+	// every call: mirror cg_type symmetrically into the double array
+	for (int row = 1; row <= ntypes; ++row)
+		for (int col = row; col <= ntypes; ++col)
+			cg_type_double[row][col] = cg_type_double[col][row] = cg_type[row][col];
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulDebyeCuda::compute(int eflag, int vflag)
+{
+	if (eflag || vflag) ev_setup(eflag,vflag);
+
+	// push the requested accumulators before the force call
+	if (eflag) {
+		cuda->cu_eng_vdwl->upload();
+		cuda->cu_eng_coul->upload();
+	}
+	if (vflag) {
+		cuda->cu_virial->upload();
+	}
+
+	Cuda_PairCGCMMCoulDebyeCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+	// fetch the results right away unless collection is deferred
+	if (!cuda->shared_data.pair.collect_forces_later) {
+		if (eflag) {
+			cuda->cu_eng_vdwl->download();
+			cuda->cu_eng_coul->download();
+		}
+		if (vflag) {
+			cuda->cu_virial->download();
+		}
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulDebyeCuda::settings(int narg, char **arg)
+{
+	// argument handling is left to the base class
+	PairCGCMMCoulCut::settings(narg, arg);
+
+	// copy the resulting global values into the shared pair data
+	cuda->shared_data.pair.cut_global      = static_cast<F_FLOAT>(cut_lj_global);
+	cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+	cuda->shared_data.pair.kappa           = static_cast<F_FLOAT>(kappa);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulDebyeCuda::coeff(int narg, char **arg)
+{
+	// base class handles the coefficient arguments
+	PairCGCMMCoulCut::coeff(narg, arg);
+
+	// afterwards refresh the shared pair data and the cg_type mirror
+	this->allocate();
+}
+
+void PairCGCMMCoulDebyeCuda::init_style()
+{
+	MYDBG(printf("# CUDA PairCGCMMCoulDebyeCuda::init_style start\n"); )
+
+	// no list is requested when whichflag is 0 and the integrator is respa
+	const bool skip_request =
+		(update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0);
+
+	if (!skip_request) {
+		// full (not half) neighbor list flagged as cudable
+		const int irequest = neighbor->request(this);
+		neighbor->requests[irequest]->full = 1;
+		neighbor->requests[irequest]->half = 0;
+		neighbor->requests[irequest]->cudable = 1;
+	}
+
+	cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+	cut_respa=NULL;
+
+	if (force->newton) error->warning("Pair style uses does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
+
+	MYDBG(printf("# CUDA PairCGCMMCoulDebyeCuda::init_style end\n"); )
+}
+
+void PairCGCMMCoulDebyeCuda::init_list(int id, NeighList *ptr)
+{
+	MYDBG(printf("# CUDA PairCGCMMCoulDebyeCuda::init_list\n");)
+	PairCGCMMCoulCut::init_list(id, ptr);
+	#ifndef CUDA_USE_BINNING
+	// only the list with id 0 (verlet) is handled; respa lists are not
+	// (see Neighbor::init() for details on lammps lists' logic)
+	if (id == 0) {
+		cuda_neigh_list = cuda->registerNeighborList(ptr);
+	}
+	#endif
+	MYDBG(printf("# CUDA PairCGCMMCoulDebyeCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base class ev_setup, then rebuild the cCudaData wrappers of the
+   per-atom energy / virial arrays when atom->nmax exceeds the capacity
+   recorded before the call
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulDebyeCuda::ev_setup(int eflag, int vflag)
+{
+	// capacities as they were before the base class call
+	int maxeatomold=maxeatom;
+	int maxvatomold=maxvatom;
+	PairCGCMMCoulCut::ev_setup(eflag,vflag);
+
+	if (eflag_atom && atom->nmax > maxeatomold)
+	{delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+
+	// the virial wrapper is keyed on vflag_atom / maxvatom and bound to
+	// atom.vatom (it used to test eflag_atom and alias atom.eatom)
+	if (vflag_atom && atom->nmax > maxvatomold)
+	{delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+
+}
+
+
--- a/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.h
+++ b/src/USER-CUDA/pair_cg_cmm_coul_debye_cuda.h
@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(cg/cmm/coul/debye/cuda,PairCGCMMCoulDebyeCuda)
+
+#else
+
+#ifndef PAIR_CG_CMM_COUL_DEBYE_CUDA_H
+#define PAIR_CG_CMM_COUL_DEBYE_CUDA_H
+
+#include "pair_cg_cmm_coul_cut.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered as cg/cmm/coul/debye/cuda. Note that it extends
+// PairCGCMMCoulCut (see the base class below). Each overridden
+// entry point forwards to the base class where applicable and additionally
+// keeps the data reachable through the Cuda object in sync.
+class PairCGCMMCoulDebyeCuda : public PairCGCMMCoulCut
+{
+	public:
+		PairCGCMMCoulDebyeCuda(class LAMMPS *);
+		// uploads energy/virial accumulators as requested, calls
+		// Cuda_PairCGCMMCoulDebyeCuda, downloads unless collect_forces_later is set
+		void compute(int, int);
+		// forwards to the base class, then stores the global cutoffs and kappa
+		void settings(int, char **);
+		// forwards to the base class, then calls allocate()
+		void coeff(int, char **);
+		// forwards to the base class; registers list id 0 with the Cuda
+		// object unless CUDA_USE_BINNING is defined
+		void init_list(int, class NeighList *);
+		// requests a full, cudable neighbor list (skipped for respa setup)
+		void init_style();
+		// forwards to the base class, then may recreate cu_eatom/cu_vatom
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// copied from lmp->cuda in the constructor
+		class Cuda *cuda;
+		// stores array pointers in cuda->shared_data.pair and refreshes
+		// cg_type_double from cg_type
+		void allocate();
+		// true once allocate() has stored those pointers
+		bool allocated2;
+		// value returned by cuda->registerNeighborList() in init_list()
+		class CudaNeighList* cuda_neigh_list;
+		// double-valued copy of cg_type, created with memory->create() in
+		// allocate() and handed to shared_data.pair.coeff5
+		// NOTE(review): no destructor is declared here, so this array does
+		// not appear to be released by this class - confirm
+		double** cg_type_double;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.cpp
+++ b/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.cpp
@ -0,0 +1,206 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_cg_cmm_coul_long_cuda.h"
+#include "pair_cg_cmm_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+/* construct the style on top of PairCGCMMCoulLong and mark pair forces as
+   cudable; aborts through error->all() when lmp->cuda is NULL */
+PairCGCMMCoulLongCuda::PairCGCMMCoulLongCuda(LAMMPS *lmp) : PairCGCMMCoulLong(lmp)
+{
+	cuda = lmp->cuda;
+	if(cuda == NULL)
+		error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+	// shared-data pointers are stored later, on the first allocate() call
+	allocated2 = false;
+	cg_type_double = NULL;
+	// compute() dereferences this pointer; give it a defined value until
+	// init_list() registers a list instead of leaving it uninitialized
+	cuda_neigh_list = NULL;
+
+	cuda->shared_data.pair.cudable_force = 1;
+	cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::allocate()
+{
+	// let the base class do its own allocation first (only once)
+	if(! allocated) PairCGCMMCoulLong::allocate();
+
+	const int ntypes = atom->ntypes;
+
+	// one-time part: create the double mirror and store all array pointers
+	if(! allocated2)
+	{
+		allocated2 = true;
+
+		memory->create(cg_type_double,ntypes+1,ntypes+1,"paircg:cgtypedouble");
+
+		cuda->shared_data.pair.cut          = cut_lj;
+		cuda->shared_data.pair.cut_coul     = cut_coul;
+		cuda->shared_data.pair.coeff1       = lj1;
+		cuda->shared_data.pair.coeff2       = lj2;
+		cuda->shared_data.pair.coeff3       = lj3;
+		cuda->shared_data.pair.coeff4       = lj4;
+		cuda->shared_data.pair.coeff5       = cg_type_double;
+		cuda->shared_data.pair.offset       = offset;
+		cuda->shared_data.pair.special_lj   = force->special_lj;
+		cuda->shared_data.pair.special_coul = force->special_coul;
+	}
+
+	// every call: mirror cg_type symmetrically into the double array
+	for (int row = 1; row <= ntypes; ++row)
+		for (int col = row; col <= ntypes; ++col)
+			cg_type_double[row][col] = cg_type_double[col][row] = cg_type[row][col];
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::compute(int eflag, int vflag)
+{
+	if (eflag || vflag) ev_setup(eflag,vflag);
+
+	// push the requested accumulators before the force call
+	if (eflag) {
+		cuda->cu_eng_vdwl->upload();
+		cuda->cu_eng_coul->upload();
+	}
+	if (vflag) {
+		cuda->cu_virial->upload();
+	}
+
+	Cuda_PairCGCMMCoulLongCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+	// fetch the results right away unless collection is deferred
+	if (!cuda->shared_data.pair.collect_forces_later) {
+		if (eflag) {
+			cuda->cu_eng_vdwl->download();
+			cuda->cu_eng_coul->download();
+		}
+		if (vflag) {
+			cuda->cu_virial->download();
+		}
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::settings(int narg, char **arg)
+{
+	// argument handling is left to the base class
+	PairCGCMMCoulLong::settings(narg, arg);
+
+	// copy the resulting global values into the shared pair data
+	cuda->shared_data.pair.cut_global      = static_cast<F_FLOAT>(cut_lj_global);
+	cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+	cuda->shared_data.pair.kappa           = static_cast<F_FLOAT>(kappa);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::coeff(int narg, char **arg)
+{
+	// base class handles the coefficient arguments
+	PairCGCMMCoulLong::coeff(narg, arg);
+
+	// afterwards refresh the shared pair data and the cg_type mirror
+	this->allocate();
+}
+
+/* ----------------------------------------------------------------------
+   style-specific initialization: neighbor list request, Ewald parameter
+   and Coulomb constant for the shared CUDA data
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::init_style()
+{
+	MYDBG(printf("# CUDA PairCGCMMCoulLongCuda::init_style start\n"); )
+
+	// no list is requested when whichflag is 0 and the integrator is respa
+	if (!(update->whichflag == 0 && strcmp(update->integrate_style,"respa") == 0))
+	{
+		// full (not half) neighbor list flagged as cudable
+		int irequest = neighbor->request(this);
+		neighbor->requests[irequest]->full = 1;
+		neighbor->requests[irequest]->half = 0;
+		neighbor->requests[irequest]->cudable = 1;
+	}
+
+	// g_ewald is read from the kspace style; fail with a clear message
+	// instead of dereferencing a NULL pointer when none is defined
+	if (force->kspace == NULL)
+		error->all("Pair style is incompatible with KSpace style");
+	g_ewald = force->kspace->g_ewald;
+
+	cuda->shared_data.pair.g_ewald=g_ewald;
+	cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+	cut_respa=NULL;
+	if (force->newton) error->warning("Pair style uses does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
+	MYDBG(printf("# CUDA PairCGCMMCoulLongCuda::init_style end\n"); )
+}
+
+void PairCGCMMCoulLongCuda::init_list(int id, NeighList *ptr)
+{
+	MYDBG(printf("# CUDA PairCGCMMCoulLongCuda::init_list\n");)
+	PairCGCMMCoulLong::init_list(id, ptr);
+	#ifndef CUDA_USE_BINNING
+	// only the list with id 0 (verlet) is handled; respa lists are not
+	// (see Neighbor::init() for details on lammps lists' logic)
+	if (id == 0) {
+		cuda_neigh_list = cuda->registerNeighborList(ptr);
+	}
+	#endif
+	MYDBG(printf("# CUDA PairCGCMMCoulLongCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base class ev_setup, then rebuild the cCudaData wrappers of the
+   per-atom energy / virial arrays when atom->nmax exceeds the capacity
+   recorded before the call
+------------------------------------------------------------------------- */
+
+void PairCGCMMCoulLongCuda::ev_setup(int eflag, int vflag)
+{
+	// capacities as they were before the base class call
+	int maxeatomold=maxeatom;
+	int maxvatomold=maxvatom;
+	PairCGCMMCoulLong::ev_setup(eflag,vflag);
+
+	if (eflag_atom && atom->nmax > maxeatomold)
+	{delete cuda->cu_eatom; cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );}
+
+	// the virial wrapper is keyed on vflag_atom / maxvatom and bound to
+	// atom.vatom (it used to test eflag_atom and alias atom.eatom)
+	if (vflag_atom && atom->nmax > maxvatomold)
+	{delete cuda->cu_vatom; cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );}
+
+}
+
+
--- a/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.h
+++ b/src/USER-CUDA/pair_cg_cmm_coul_long_cuda.h
@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(cg/cmm/coul/long/cuda,PairCGCMMCoulLongCuda)
+
+#else
+
+#ifndef PAIR_CG_CMM_COUL_LONG_CUDA_H
+#define PAIR_CG_CMM_COUL_LONG_CUDA_H
+
+#include "pair_cg_cmm_coul_long.h"
+
+namespace LAMMPS_NS {
+
+// cuda flavor of the cg/cmm/coul/long pair style, registered above as
+// cg/cmm/coul/long/cuda; derives from PairCGCMMCoulLong
+class PairCGCMMCoulLongCuda : public PairCGCMMCoulLong
+{
+	public:
+		PairCGCMMCoulLongCuda(class LAMMPS *);
+		void compute(int, int);
+		void settings(int, char **);
+		void coeff(int, char **);
+		// forwards to the parent and registers list id 0 with the cuda object
+		void init_list(int, class NeighList *);
+		// requests a full, cudable neighbor list and copies g_ewald / qqrd2e
+		void init_style();
+		// parent ev_setup() plus re-creation of cuda->cu_eatom / cu_vatom
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// NOTE(review): presumably taken from lmp->cuda in the constructor,
+		// as the sibling /cuda pair styles do - confirm
+		class Cuda *cuda;
+		void allocate();
+		bool allocated2;
+		// set in init_list() for list id 0 when CUDA_USE_BINNING is not defined
+		class CudaNeighList* cuda_neigh_list;
+		double** cg_type_double;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_cg_cmm_cuda.cpp
+++ b/src/USER-CUDA/pair_cg_cmm_cuda.cpp
@ -0,0 +1,201 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_cg_cmm_cuda.h"
+#include "pair_cg_cmm_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairCGCMMCuda::PairCGCMMCuda(LAMMPS *lmp) : PairCGCMM(lmp)
+{
+  // a /cuda pair style needs the cuda object owned by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  cg_type_double = NULL;
+
+  // give every pointer member a defined value; cuda_neigh_list is only
+  // assigned later in init_list(), and the *_gm mirrors are not created
+  // anywhere in this file
+  cuda_neigh_list = NULL;
+  cu_lj1_gm = NULL;
+  cu_lj2_gm = NULL;
+  cu_lj3_gm = NULL;
+  cu_lj4_gm = NULL;
+  cu_cg_type_double_gm = NULL;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairCGCMMCuda::allocate()
+{
+  // parent arrays first, if they do not exist yet
+  if (!allocated) PairCGCMM::allocate();
+
+  const int ntypes = atom->ntypes;
+
+  // one-time part: create cg_type_double and store the coefficient
+  // array pointers in the cuda shared data
+  if (!allocated2) {
+    allocated2 = true;
+
+    memory->create(cg_type_double,ntypes+1,ntypes+1,"paircg:cgtypedouble");
+
+    cuda->shared_data.pair.cut     = cut;
+    cuda->shared_data.pair.coeff1  = lj1;
+    cuda->shared_data.pair.coeff2  = lj2;
+    cuda->shared_data.pair.coeff3  = lj3;
+    cuda->shared_data.pair.coeff4  = lj4;
+    cuda->shared_data.pair.coeff5  = cg_type_double;
+    /*cu_lj1_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj1, &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
+    cu_lj2_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj2, &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
+    cu_lj3_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj3, &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
+    cu_lj4_gm = new cCudaData<double, F_FLOAT, x> ((double*)lj4, &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
+    cu_cg_type_double_gm = new cCudaData<double, F_FLOAT, x> ((double*)cg_type_double, &cuda->shared_data.pair.coeff5_gm, (atom->ntypes+1)*(atom->ntypes+1));*/
+    cuda->shared_data.pair.offset  = offset;
+    cuda->shared_data.pair.special_lj  = force->special_lj;
+  }
+
+  // done on every call: copy cg_type into cg_type_double,
+  // writing both [i][j] and [j][i] from the i <= j entry
+  for (int itype = 1; itype <= ntypes; itype++)
+    for (int jtype = itype; jtype <= ntypes; jtype++) {
+      const double cgt = cg_type[itype][jtype];
+      cg_type_double[itype][jtype] = cgt;
+      cg_type_double[jtype][itype] = cgt;
+    }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  // upload() the energy / virial accumulators that were requested
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairCGCMMCuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag, eflag_atom, vflag_atom);
+
+  // download() them again right away unless collect_forces_later is set
+  const bool fetch_now = !cuda->shared_data.pair.collect_forces_later;
+  if (fetch_now && eflag) cuda->cu_eng_vdwl->download();
+  if (fetch_now && vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCuda::settings(int narg, char **arg)
+{
+  // the parent parses the arguments; afterwards the global LJ cutoff
+  // is copied, converted to F_FLOAT, into the cuda shared data
+  PairCGCMM::settings(narg, arg);
+
+  const F_FLOAT cut_global_cuda = (F_FLOAT) cut_lj_global;
+  cuda->shared_data.pair.cut_global = cut_global_cuda;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCGCMMCuda::coeff(int narg, char **arg)
+{
+  // coefficients are parsed by the parent class
+  PairCGCMM::coeff(narg, arg);
+
+  // allocate() then refreshes cg_type_double from cg_type
+  this->allocate();
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style: request a full, cudable neighbor list
+------------------------------------------------------------------------- */
+
+void PairCGCMMCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairCGCMMCuda::init_style start\n"); )
+
+  // no list is requested when update->whichflag is 0 and the
+  // integrate style is "respa"; in every other case one is
+  const bool skip_request =
+    (update->whichflag == 0) && (strcmp(update->integrate_style,"respa") == 0);
+
+  if (!skip_request) {
+    const int ireq = neighbor->request(this);
+    neighbor->requests[ireq]->full = 1;
+    neighbor->requests[ireq]->half = 0;
+    neighbor->requests[ireq]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+  cut_respa=NULL;
+
+  MYDBG(printf("# CUDA PairCGCMMCuda::init_style end\n"); )
+}
+
+/* ----------------------------------------------------------------------
+   hand the neighbor list to the parent class and, unless
+   CUDA_USE_BINNING is defined, register list 0 with the cuda object
+------------------------------------------------------------------------- */
+
+void PairCGCMMCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairCGCMMCuda::init_list\n");)
+
+  PairCGCMM::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) is handled so far, respa lists are not;
+  // Neighbor::init() describes how lammps assigns list ids
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairCGCMMCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the parent ev_setup() and rebuild the cuda mirrors of the
+   per-atom energy / virial arrays whenever atom->nmax exceeds the
+   capacity those arrays had before the parent call
+------------------------------------------------------------------------- */
+
+void PairCGCMMCuda::ev_setup(int eflag, int vflag)
+{
+  // capacities as they were before the parent call
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+
+  PairCGCMM::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // the virial mirror follows the virial flag, the virial capacity and
+  // the virial descriptor (it used to reuse the energy ones)
+  // NOTE(review): assumes shared_data.atom.vatom is the descriptor that
+  // belongs to cu_vatom - confirm against the shared data declaration
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
--- a/src/USER-CUDA/pair_cg_cmm_cuda.h
+++ b/src/USER-CUDA/pair_cg_cmm_cuda.h
@ -0,0 +1,64 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(cg/cmm/cuda,PairCGCMMCuda)
+
+#else
+
+#ifndef PAIR_CG_CMM_CUDA_H
+#define PAIR_CG_CMM_CUDA_H
+
+#include "pair_cg_cmm.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// cuda flavor of the cg/cmm pair style, registered above as cg/cmm/cuda;
+// derives from PairCGCMM and forwards the force computation to
+// Cuda_PairCGCMMCuda()
+class PairCGCMMCuda : public PairCGCMM
+{
+	public:
+		PairCGCMMCuda(class LAMMPS *);
+		// calls Cuda_PairCGCMMCuda() between upload()/download() of the
+		// energy and virial accumulators
+		void compute(int, int);
+		// parent settings() plus copy of cut_lj_global into the shared data
+		void settings(int, char **);
+		// parent coeff() followed by allocate()
+		void coeff(int, char **);
+		// forwards to the parent and registers list id 0 with the cuda object
+		void init_list(int, class NeighList *);
+		// requests a full, cudable neighbor list
+		void init_style();
+		// parent ev_setup() plus re-creation of cuda->cu_eatom / cu_vatom
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// taken from lmp->cuda in the constructor; must not be NULL
+		class Cuda *cuda;
+		void allocate();
+		// true once allocate() has done its one-time part
+		bool allocated2;
+		// set in init_list() for list id 0 when CUDA_USE_BINNING is not defined
+		class CudaNeighList* cuda_neigh_list;
+		// double-valued copy of cg_type, filled in allocate()
+		double** cg_type_double;
+		// the only code in pair_cg_cmm_cuda.cpp that would create these
+		// is commented out in allocate()
+		cCudaData<double  , F_FLOAT , x >* cu_lj1_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_lj2_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_lj3_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_lj4_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_cg_type_double_gm;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_eam_alloy_cuda.cpp
+++ b/src/USER-CUDA/pair_eam_alloy_cuda.cpp
@ -0,0 +1,326 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Stephen Foiles (SNL), Murray Daw (SNL)
+------------------------------------------------------------------------- */
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_eam_alloy_cuda.h"
+#include "atom.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMAlloyCuda::PairEAMAlloyCuda(LAMMPS *lmp) : PairEAMCuda(lmp)
+{
+  // a /cuda pair style needs the cuda object owned by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (!cuda) {
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  }
+
+  one_coeff = 1;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+   read DYNAMO setfl file
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyCuda::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+
+  const int ntypes = atom->ntypes;
+
+  // expected form: "* *", the file name, then one entry per atom type
+
+  if (narg != 3 + ntypes)
+    error->all("Incorrect args for pair coefficients");
+
+  const bool both_wildcards =
+    (strcmp(arg[0],"*") == 0) && (strcmp(arg[1],"*") == 0);
+  if (!both_wildcards)
+    error->all("Incorrect args for pair coefficients");
+
+  // discard a previously read setfl file, then read the new one
+
+  if (setfl) {
+    for (int ielem = 0; ielem < setfl->nelements; ielem++)
+      delete [] setfl->elements[ielem];
+    delete [] setfl->elements;
+    delete [] setfl->mass;
+    memory->destroy(setfl->frho);
+    memory->destroy(setfl->rhor);
+    memory->destroy(setfl->z2r);
+    delete setfl;
+  }
+  setfl = new Setfl();
+  read_file(arg[2]);
+
+  // arg[3..] name the element of atom types 1..ntypes
+  // map[type] = index of that element in the file, -1 for "NULL"
+
+  for (int iarg = 3; iarg < narg; iarg++) {
+    const int itype = iarg - 2;
+
+    if (strcmp(arg[iarg],"NULL") == 0) {
+      map[itype] = -1;
+      continue;
+    }
+
+    int ielem = 0;
+    while (ielem < setfl->nelements &&
+           strcmp(arg[iarg],setfl->elements[ielem]) != 0) ielem++;
+
+    if (ielem < setfl->nelements) map[itype] = ielem;
+    else error->all("No matching element in EAM potential file");
+  }
+
+  // coeff() is called once with I,J = * *, so start from a clean setflag
+
+  for (int itype = 1; itype <= ntypes; itype++)
+    for (int jtype = itype; jtype <= ntypes; jtype++)
+      setflag[itype][jtype] = 0;
+
+  // flag every pair whose two types are mapped to elements,
+  // and set the mass of a type on its diagonal entry
+
+  int nflagged = 0;
+  for (int itype = 1; itype <= ntypes; itype++) {
+    for (int jtype = itype; jtype <= ntypes; jtype++) {
+      if (map[itype] < 0 || map[jtype] < 0) continue;
+      setflag[itype][jtype] = 1;
+      if (itype == jtype) atom->set_mass(itype,setfl->mass[map[itype]]);
+      nflagged++;
+    }
+  }
+
+  if (nflagged == 0) error->all("Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element DYNAMO setfl file
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyCuda::read_file(char *filename)
+{
+  // filename: path of the setfl file; it is opened on proc 0 only and
+  // every value read is broadcast, so all procs fill *setfl identically
+
+  Setfl *file = setfl;
+
+  // open potential file
+
+  int me = comm->me;
+  FILE *fptr;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = fopen(filename,"r");
+    if (fptr == NULL) {
+      // bounded formatting: a long file name is truncated in the
+      // message instead of overrunning the buffer
+      char str[128];
+      snprintf(str,sizeof(str),"Cannot open EAM potential file %s",filename);
+      error->one(str);
+    }
+  }
+
+  // read and broadcast header
+  // extract element names from nelements line
+
+  int n;
+  if (me == 0) {
+    // the first three lines are skipped, the fourth one is kept
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    n = strlen(line) + 1;
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)
+    error->all("Incorrect element names in EAM potential file");
+
+  // the first token is the element count and is dropped; the names that
+  // follow plus the terminating NULL fill the nelements+1 slots of words
+
+  char **words = new char*[file->nelements+1];
+  nwords = 0;
+  strtok(line," \t\n\r\f");
+  while ((words[nwords++] = strtok(NULL," \t\n\r\f")) != NULL) continue;
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);
+  }
+  delete [] words;
+
+  // grid line: nrho drho nr dr cut
+
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    sscanf(line,"%d %lg %d %lg %lg",
+	   &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+
+  // tables are stored from index 1, hence the +1 in the last dimension
+
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho");
+  memory->create(file->rhor,file->nelements,file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,file->nr+1,
+		 "pair:z2r");
+
+  // per element: a line with the mass, then the frho and rhor tables
+
+  int i,j,tmp;
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      fgets(line,MAXLINE,fptr);
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+    if (me == 0) grab(fptr,file->nr,&file->rhor[i][1]);
+    MPI_Bcast(&file->rhor[i][1],file->nr,MPI_DOUBLE,0,world);
+  }
+
+  // z2r tables, read for the element pairs with j <= i only
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   copy read-in setfl potential to standard array format
+------------------------------------------------------------------------- */
+
+void PairEAMAlloyCuda::file2array()
+{
+  const int ntypes = atom->ntypes;
+  const int nelem = setfl->nelements;
+
+  // grid parameters are taken over unchanged from the setfl file
+
+  nrho = setfl->nrho;
+  nr = setfl->nr;
+  drho = setfl->drho;
+  dr = setfl->dr;
+
+  // ------------------------------------------------------------------
+  // frho: one table per setfl element plus one table of zeroes
+  // ------------------------------------------------------------------
+
+  nfrho = nelem + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  for (int ielem = 0; ielem < nelem; ielem++)
+    for (int k = 1; k <= nrho; k++)
+      frho[ielem][k] = setfl->frho[ielem][k];
+
+  // the extra table of zeroes is what non-EAM types point to
+  // (pair hybrid), because fp is still computed for non-EAM atoms
+
+  const int zero_table = nfrho - 1;
+  for (int k = 1; k <= nrho; k++) frho[zero_table][k] = 0.0;
+
+  // type2frho[type] = frho table of that type (0 to nfrho-1);
+  // unmapped types get the table of zeroes
+
+  for (int itype = 1; itype <= ntypes; itype++)
+    type2frho[itype] = (map[itype] >= 0) ? map[itype] : zero_table;
+
+  // ------------------------------------------------------------------
+  // rhor: one table per setfl element
+  // ------------------------------------------------------------------
+
+  nrhor = nelem;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  for (int ielem = 0; ielem < nelem; ielem++)
+    for (int k = 1; k <= nr; k++)
+      rhor[ielem][k] = setfl->rhor[ielem][k];
+
+  // type2rhor[i][j] = rhor table of the pair (0 to nrhor-1)
+  // for setfl files the I,J mapping depends on I only
+  // map = -1 (non-EAM atom in pair hybrid) is OK b/c type2rhor is not used
+
+  for (int itype = 1; itype <= ntypes; itype++) {
+    const int table = map[itype];
+    for (int jtype = 1; jtype <= ntypes; jtype++)
+      type2rhor[itype][jtype] = table;
+  }
+
+  // ------------------------------------------------------------------
+  // z2r: N*(N+1)/2 tables where N = # of setfl elements
+  // ------------------------------------------------------------------
+
+  nz2r = nelem * (nelem+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // element pairs with I >= J are packed row by row
+
+  int ipair = 0;
+  for (int ielem = 0; ielem < nelem; ielem++)
+    for (int jelem = 0; jelem <= ielem; jelem++) {
+      for (int k = 1; k <= nr; k++)
+        z2r[ipair][k] = setfl->z2r[ielem][jelem][k];
+      ipair++;
+    }
+
+  // type2z2r[i][j] = z2r table of the pair (0 to nz2r-1)
+  // only the lower triangle of the element matrix is stored, so the
+  // larger element index is the row and the smaller one the column
+  // rows 0..row-1 hold 1 + 2 + ... + row = row*(row+1)/2 tables
+  // if map = -1 (non-EAM atom in pair hybrid):
+  //   type2z2r is not used by non-opt
+  //   but set type2z2r to 0 since accessed by opt
+
+  for (int itype = 1; itype <= ntypes; itype++) {
+    for (int jtype = 1; jtype <= ntypes; jtype++) {
+      const int ei = map[itype];
+      const int ej = map[jtype];
+
+      if (ei == -1 || ej == -1) {
+        type2z2r[itype][jtype] = 0;
+        continue;
+      }
+
+      const int row = (ei < ej) ? ej : ei;
+      const int col = (ei < ej) ? ei : ej;
+      type2z2r[itype][jtype] = row * (row + 1) / 2 + col;
+    }
+  }
+}
--- a/src/USER-CUDA/pair_eam_alloy_cuda.h
+++ b/src/USER-CUDA/pair_eam_alloy_cuda.h
@ -0,0 +1,44 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/alloy/cuda,PairEAMAlloyCuda)
+
+#else
+
+#ifndef LMP_PAIR_EAM_CUDA_ALLOY_H
+#define LMP_PAIR_EAM_CUDA_ALLOY_H
+
+#include "pair_eam_cuda.h"
+
+namespace LAMMPS_NS {
+
+// use virtual public since this class is parent in multiple inheritance
+
+// cuda flavor of the eam/alloy pair style, registered above as
+// eam/alloy/cuda; reads a multi-element DYNAMO setfl file
+class PairEAMAlloyCuda : virtual public PairEAMCuda {
+ public:
+  PairEAMAlloyCuda(class LAMMPS *);
+  virtual ~PairEAMAlloyCuda() {}
+  // expects "* *", the setfl file name and one element name per atom type
+  void coeff(int, char **);
+
+ protected:
+  // declared again here although PairEAMCuda already has a member of
+  // this name; the constructor assigns lmp->cuda to this one
+  class Cuda *cuda;
+  // reads the setfl file on proc 0 and broadcasts its content
+  void read_file(char *);
+  // copies the setfl tables into the frho / rhor / z2r arrays
+  void file2array();
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_eam_cuda.cpp
+++ b/src/USER-CUDA/pair_eam_cuda.cpp
@ -0,0 +1,239 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_eam_cuda.h"
+#include "pair_eam_cuda_cu.h"
+#include "pair_virial_compute_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairEAMCuda::PairEAMCuda(LAMMPS *lmp) : PairEAM(lmp)
+{
+  // a /cuda pair style needs the cuda object owned by the LAMMPS instance
+  cuda = lmp->cuda;
+  if (cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.override_block_per_atom = 0;
+
+  cuda->setSystemParams();
+
+  // every pointer member starts out as NULL: the cu_* wrappers are
+  // deleted before their first creation, and cuda_neigh_list is only
+  // assigned later in init_list()
+  cuda_neigh_list = NULL;
+  cu_rho = NULL;
+  cu_fp = NULL;
+  cu_frho_spline = NULL;
+  cu_z2r_spline = NULL;
+  cu_rhor_spline = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairEAMCuda::allocate()
+{
+  // parent arrays first, if they do not exist yet
+  if (!allocated) {
+    PairEAM::allocate();
+  }
+
+  // store the cutsq pointer and the squared force cutoff in the shared data
+  cuda->shared_data.pair.cutsq = cutsq;
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cutforcesq;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMCuda::compute(int eflag, int vflag)
+{
+  cuda->shared_data.pair.cut_global = (F_FLOAT) cutforcesq;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->shared_data.pair.collect_forces_later = 0;
+
+  // grow rho / fp together with their cuda wrappers once atom->nmax
+  // exceeds the current capacity, then re-run the cuda side init
+  const bool must_grow = atom->nmax > nmax;
+  if (must_grow) {
+    memory->destroy(rho);
+    memory->destroy(fp);
+    nmax = atom->nmax;
+    memory->create(rho,nmax,"pair:rho");
+    memory->create(fp,nmax,"pair:fp");
+
+    delete cu_rho;
+    delete cu_fp;
+    cu_rho = new cCudaData<double, F_FLOAT, x> (rho, atom->nmax);
+    cu_fp  = new cCudaData<double, F_FLOAT, x> (fp, atom->nmax);
+
+    Cuda_PairEAMCuda_Init(&cuda->shared_data,rdr,rdrho,nfrho,nrhor,nr,nrho,nz2r,
+                          cu_frho_spline->dev_data(),cu_rhor_spline->dev_data(),cu_z2r_spline->dev_data(),
+                          cu_rho->dev_data(),cu_fp->dev_data(),type2frho,type2z2r,type2rhor);
+  }
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  // two cuda passes with a forward communication between them
+  Cuda_PairEAM1Cuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag,eflag_atom,vflag_atom);
+  comm->forward_comm_pair(this);
+  Cuda_PairEAM2Cuda(& cuda->shared_data, & cuda_neigh_list->sneighlist, eflag, vflag,eflag_atom,vflag_atom);
+
+  if (eflag) cuda->cu_eng_vdwl->download();
+  if (vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMCuda::settings(int narg, char **arg)
+{
+  // the parent parses the arguments; afterwards the squared force
+  // cutoff is copied, converted to F_FLOAT, into the cuda shared data
+  PairEAM::settings(narg, arg);
+
+  const F_FLOAT cut_global_cuda = (F_FLOAT) cutforcesq;
+  cuda->shared_data.pair.cut_global = cut_global_cuda;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMCuda::coeff(int narg, char **arg)
+{
+  // coefficients are parsed by the parent class
+  PairEAM::coeff(narg, arg);
+
+  // allocate() then stores cutsq / cutforcesq in the cuda shared data
+  this->allocate();
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style:
+   build the spline tables, request a full, cudable neighbor list and
+   create and upload the cuda wrappers of the spline tables
+------------------------------------------------------------------------- */
+
+void PairEAMCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairEAMCuda::init_style start\n"); )
+
+  // potential file data -> arrays -> spline coefficients
+  file2array();
+  array2spline();
+
+  const int ireq = neighbor->request(this);
+  neighbor->requests[ireq]->full = 1;
+  neighbor->requests[ireq]->half = 0;
+  neighbor->requests[ireq]->cudable = 1;
+
+  // drop the wrappers of a previous init before creating new ones
+  delete cu_rhor_spline;
+  delete cu_z2r_spline;
+  delete cu_frho_spline;
+
+  cu_rhor_spline = new cCudaData<double, F_FLOAT, xyz>((double*)rhor_spline,nrhor,nr+1,EAM_COEFF_LENGTH);
+  cu_z2r_spline = new cCudaData<double, F_FLOAT, xyz>((double*)z2r_spline,nz2r,nr+1,EAM_COEFF_LENGTH);
+  cu_frho_spline = new cCudaData<double, F_FLOAT, xyz>((double*)frho_spline,nfrho,nrho+1,EAM_COEFF_LENGTH);
+
+  cu_rhor_spline->upload();
+  cu_z2r_spline->upload();
+  cu_frho_spline->upload();
+
+  MYDBG(printf("# CUDA PairEAMCuda::init_style end\n"); )
+}
+
+/* ----------------------------------------------------------------------
+   hand the neighbor list to the parent class and, unless
+   CUDA_USE_BINNING is defined, register list 0 with the cuda object
+------------------------------------------------------------------------- */
+
+void PairEAMCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairEAMCuda::init_list\n");)
+
+  PairEAM::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) is handled so far, respa lists are not;
+  // Neighbor::init() describes how lammps assigns list ids
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairEAMCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   build the frho / rhor / z2r spline tables from the tabulated arrays
+   and duplicate coefficient 3 of every entry into slot 7
+------------------------------------------------------------------------- */
+
+void PairEAMCuda::array2spline()
+{
+  rdr = 1.0/dr;
+  rdrho = 1.0/drho;
+
+  memory->destroy(frho_spline);
+  memory->destroy(rhor_spline);
+  memory->destroy(z2r_spline);
+
+  // the innermost extent has to hold slot 7, which is written below,
+  // and has to match the extent init_style() passes to cCudaData;
+  // an extent of 7 made every [..][7] store land outside its entry
+  // NOTE(review): assumes EAM_COEFF_LENGTH >= 8 - confirm where it is defined
+
+  memory->create(frho_spline,nfrho,nrho+1,EAM_COEFF_LENGTH,"pair:frho");
+  memory->create(rhor_spline,nrhor,nr+1,EAM_COEFF_LENGTH,"pair:rhor");
+  memory->create(z2r_spline,nz2r,nr+1,EAM_COEFF_LENGTH,"pair:z2r");
+
+  for (int i = 0; i < nfrho; i++){
+    interpolate(nrho,drho,frho[i],frho_spline[i]);
+    for(int j=0;j<nrho+1;j++)
+      frho_spline[i][j][7]=frho_spline[i][j][3];
+  }
+
+  for (int i = 0; i < nrhor; i++){
+    interpolate(nr,dr,rhor[i],rhor_spline[i]);
+    for(int j=0;j<nr+1;j++)
+      rhor_spline[i][j][7]=rhor_spline[i][j][3];
+  }
+
+  for (int i = 0; i < nz2r; i++){
+    interpolate(nr,dr,z2r[i],z2r_spline[i]);
+    for(int j=0;j<nr+1;j++)
+      z2r_spline[i][j][7]=z2r_spline[i][j][3];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int PairEAMCuda::pack_comm(int n, int *iswap, double *buf, int pbc_flag, int *pbc)
+{
+  // packing is delegated to the cuda side, which is handed the value
+  // iswap points to; pbc_flag and pbc are not used
+  Cuda_PairEAMCuda_PackComm(&cuda->shared_data,n,*iswap,buf);
+
+  // the result is 1 whether or not F_FLOAT is narrower than double
+  return 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairEAMCuda::unpack_comm(int n, int first, double *buf)
+{
+  // unpacking is delegated to the cuda side, which is handed the
+  // dev_data() pointer of the fp wrapper
+  void *fp_dev = cu_fp->dev_data();
+  Cuda_PairEAMCuda_UnpackComm(&cuda->shared_data,n,first,buf,fp_dev);
+}
+
--- a/src/USER-CUDA/pair_eam_cuda.h
+++ b/src/USER-CUDA/pair_eam_cuda.h
@ -0,0 +1,78 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+#ifdef PAIR_CLASS
+
+PairStyle(eam/cuda,PairEAMCuda)
+
+#else
+
+#ifndef PAIR_EAM_CUDA_H
+#define PAIR_EAM_CUDA_H
+
+#include "cuda_data.h"
+#include "pair_eam.h"
+
+namespace LAMMPS_NS {
+
+// cuda flavor of the eam pair style, registered above as eam/cuda;
+// derives from PairEAM and forwards the force computation to
+// Cuda_PairEAM1Cuda() / Cuda_PairEAM2Cuda()
+class PairEAMCuda : public PairEAM
+{
+	public:
+		PairEAMCuda(class LAMMPS *);
+		// two cuda passes with comm->forward_comm_pair() between them
+		void compute(int, int);
+		// parent settings() plus copy of cutforcesq into the shared data
+		void settings(int, char **);
+		// parent coeff() followed by allocate()
+		void coeff(int, char **);
+		// forwards to the parent and registers list id 0 with the cuda object
+		void init_list(int, class NeighList *);
+		// builds the spline tables, requests a full, cudable neighbor list
+		// and uploads the spline wrappers
+		void init_style();
+		// rebuilds frho_spline / rhor_spline / z2r_spline via interpolate()
+		void array2spline();
+		// forward to Cuda_PairEAMCuda_PackComm() / Cuda_PairEAMCuda_UnpackComm()
+		int pack_comm(int n, int *iswap, double *buf, int pbc_flag, int *pbc);
+		void unpack_comm(int n, int first, double *buf);
+	protected:
+		// taken from lmp->cuda in the constructor; must not be NULL
+		class Cuda *cuda;
+		void allocate();
+		// set to false in the constructor; not read in pair_eam_cuda.cpp
+		bool allocated2;
+		// set in init_list() for list id 0 when CUDA_USE_BINNING is not defined
+		class CudaNeighList* cuda_neigh_list;
+		// wrappers around rho / fp, re-created in compute() when atom->nmax grows
+		cCudaData<double, F_FLOAT, x>* cu_rho;
+		cCudaData<double, F_FLOAT, x>* cu_fp;
+		// wrappers around the spline tables, re-created and uploaded in init_style()
+	    cCudaData<double, F_FLOAT, xyz>* cu_rhor_spline;
+	    cCudaData<double, F_FLOAT, xyz>* cu_z2r_spline;
+	    cCudaData<double, F_FLOAT, xyz>* cu_frho_spline;
+
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_eam_fs_cuda.cpp
+++ b/src/USER-CUDA/pair_eam_fs_cuda.cpp
@ -0,0 +1,335 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Tim Lau (MIT)
+------------------------------------------------------------------------- */
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_eam_fs_cuda.h"
+#include "atom.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define MAXLINE 1024
+
+/* ----------------------------------------------------------------------
+   constructor: take the Cuda handle from the LAMMPS instance and stop
+   with an error when no Cuda object is present
+------------------------------------------------------------------------- */
+
+PairEAMFSCuda::PairEAMFSCuda(LAMMPS *lmp) : PairEAMCuda(lmp)
+{
+  cuda = lmp->cuda;
+  if (!cuda) {
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  }
+
+  // coeff() below only accepts a single "* *" coefficient line
+  one_coeff = 1;
+}
+
+/* ----------------------------------------------------------------------
+   handle the single coefficient line  "* * file elem_1 ... elem_ntypes":
+   load the Finnis-Sinclair potential file arg[2] and assign one of its
+   elements (or NULL) to every atom type
+------------------------------------------------------------------------- */
+
+void PairEAMFSCuda::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+
+  // two wildcards + file name + one entry per atom type
+
+  if (narg != 3 + atom->ntypes)
+    error->all("Incorrect args for pair coefficients");
+
+  const bool wildcards =
+    (strcmp(arg[0],"*") == 0) && (strcmp(arg[1],"*") == 0);
+  if (!wildcards)
+    error->all("Incorrect args for pair coefficients");
+
+  // drop a previously loaded potential, then read the requested file
+
+  if (fs) {
+    for (int e = 0; e < fs->nelements; e++) delete [] fs->elements[e];
+    delete [] fs->elements;
+    delete [] fs->mass;
+    memory->destroy(fs->frho);
+    memory->destroy(fs->rhor);
+    memory->destroy(fs->z2r);
+    delete fs;
+  }
+  fs = new Fs();
+  read_file(arg[2]);
+
+  // arg[3..narg-1] name the element of atom type 1..ntypes
+  // map[type] = index of that element in the file, -1 for NULL
+
+  for (int iarg = 3; iarg < narg; iarg++) {
+    const int itype = iarg - 2;
+
+    if (strcmp(arg[iarg],"NULL") == 0) {
+      map[itype] = -1;
+      continue;
+    }
+
+    int ielem = 0;
+    while (ielem < fs->nelements &&
+           strcmp(arg[iarg],fs->elements[ielem]) != 0)
+      ielem++;
+
+    if (ielem < fs->nelements) map[itype] = ielem;
+    else error->all("No matching element in EAM potential file");
+  }
+
+  // rebuild setflag for all type pairs i <= j in one pass:
+  //   1 if both types are mapped to an element, 0 otherwise
+  // the mass of a mapped type is taken from the potential file (i == j)
+
+  const int ntypes = atom->ntypes;
+  int npairs = 0;
+
+  for (int itype = 1; itype <= ntypes; itype++) {
+    for (int jtype = itype; jtype <= ntypes; jtype++) {
+      const bool mapped = (map[itype] >= 0) && (map[jtype] >= 0);
+      setflag[itype][jtype] = mapped ? 1 : 0;
+      if (!mapped) continue;
+
+      if (itype == jtype) atom->set_mass(itype,fs->mass[map[itype]]);
+      npairs++;
+    }
+  }
+
+  if (npairs == 0) error->all("Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   read a multi-element DYNAMO setfl file (Finnis-Sinclair variant)
+   into the Fs struct that fs points to
+   only proc 0 reads the file, every value read is broadcast from proc 0
+   layout consumed below:
+     3 lines that are skipped
+     line 4: nelements followed by nelements element names
+     line 5: nrho drho nr dr cut
+     per element i: a line whose 2nd field is the mass, then nrho frho
+       values, then for each element j nr rhor values
+     finally nr z2r values for each pair i,j with j <= i
+------------------------------------------------------------------------- */
+
+void PairEAMFSCuda::read_file(char *filename)
+{
+  Fs *file = fs;
+
+  // open potential file
+
+  int me = comm->me;
+  FILE *fptr = NULL;
+  char line[MAXLINE];
+
+  if (me == 0) {
+    fptr = fopen(filename,"r");
+    if (fptr == NULL) {
+      // bounded write: the file name comes from the input script and may
+      // be longer than the message buffer
+      char str[128];
+      snprintf(str,sizeof(str),"Cannot open EAM potential file %s",filename);
+      error->one(str);
+    }
+  }
+
+  // read and broadcast header
+  // extract element names from nelements line
+
+  int n;
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    fgets(line,MAXLINE,fptr);
+    n = strlen(line) + 1;
+  }
+  MPI_Bcast(&n,1,MPI_INT,0,world);
+  MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+  sscanf(line,"%d",&file->nelements);
+  int nwords = atom->count_words(line);
+  if (nwords != file->nelements + 1)
+    error->all("Incorrect element names in EAM potential file");
+
+  // words[] receives the nelements names plus the terminating NULL,
+  // which is why it holds nelements+1 entries
+
+  char **words = new char*[file->nelements+1];
+  nwords = 0;
+  strtok(line," \t\n\r\f");            // skip the leading element count
+  while ((words[nwords++] = strtok(NULL," \t\n\r\f")) != NULL) continue;
+
+  file->elements = new char*[file->nelements];
+  for (int i = 0; i < file->nelements; i++) {
+    n = strlen(words[i]) + 1;
+    file->elements[i] = new char[n];
+    strcpy(file->elements[i],words[i]);
+  }
+  delete [] words;
+
+  // grid description: nrho drho nr dr cut
+
+  if (me == 0) {
+    fgets(line,MAXLINE,fptr);
+    sscanf(line,"%d %lg %d %lg %lg",
+           &file->nrho,&file->drho,&file->nr,&file->dr,&file->cut);
+  }
+
+  MPI_Bcast(&file->nrho,1,MPI_INT,0,world);
+  MPI_Bcast(&file->drho,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->nr,1,MPI_INT,0,world);
+  MPI_Bcast(&file->dr,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&file->cut,1,MPI_DOUBLE,0,world);
+
+  // tables are filled starting at index 1, hence the +1 in the last dim
+
+  file->mass = new double[file->nelements];
+  memory->create(file->frho,file->nelements,file->nrho+1,"pair:frho");
+  memory->create(file->rhor,file->nelements,file->nelements,
+                 file->nr+1,"pair:rhor");
+  memory->create(file->z2r,file->nelements,file->nelements,
+                 file->nr+1,"pair:z2r");
+
+  // per-element section: mass line, frho table, one rhor table per element
+
+  int i,j,tmp;
+  for (i = 0; i < file->nelements; i++) {
+    if (me == 0) {
+      fgets(line,MAXLINE,fptr);
+      sscanf(line,"%d %lg",&tmp,&file->mass[i]);
+    }
+    MPI_Bcast(&file->mass[i],1,MPI_DOUBLE,0,world);
+
+    if (me == 0) grab(fptr,file->nrho,&file->frho[i][1]);
+    MPI_Bcast(&file->frho[i][1],file->nrho,MPI_DOUBLE,0,world);
+
+    for (j = 0; j < file->nelements; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->rhor[i][j][1]);
+      MPI_Bcast(&file->rhor[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+  }
+
+  // z2r tables exist only for j <= i
+
+  for (i = 0; i < file->nelements; i++)
+    for (j = 0; j <= i; j++) {
+      if (me == 0) grab(fptr,file->nr,&file->z2r[i][j][1]);
+      MPI_Bcast(&file->z2r[i][j][1],file->nr,MPI_DOUBLE,0,world);
+    }
+
+  // close the potential file
+
+  if (me == 0) fclose(fptr);
+}
+
+/* ----------------------------------------------------------------------
+   copy the potential read into fs to the frho/rhor/z2r arrays of the
+   pair style and build the type -> array index tables
+   (type2frho, type2rhor, type2z2r)
+------------------------------------------------------------------------- */
+
+void PairEAMFSCuda::file2array()
+{
+  const int ntypes = atom->ntypes;
+  const int nelem = fs->nelements;
+
+  // grid parameters are taken over unchanged from the fs file
+
+  nrho = fs->nrho;
+  drho = fs->drho;
+  nr = fs->nr;
+  dr = fs->dr;
+
+  // ------------------------------------------------------------------
+  // frho: one array per fs element + one trailing array of zeroes
+  // ------------------------------------------------------------------
+
+  nfrho = nelem + 1;
+  memory->destroy(frho);
+  memory->create(frho,nfrho,nrho+1,"pair:frho");
+
+  for (int ielem = 0; ielem < nelem; ielem++)
+    for (int k = 1; k <= nrho; k++)
+      frho[ielem][k] = fs->frho[ielem][k];
+
+  // the zero array is what non-EAM types (pair hybrid) point to,
+  // needed b/c fp is still computed for non-EAM atoms
+
+  for (int k = 1; k <= nrho; k++) frho[nfrho-1][k] = 0.0;
+
+  // type2frho[type] = element index, or the zero array for unmapped types
+
+  for (int itype = 1; itype <= ntypes; itype++)
+    type2frho[itype] = (map[itype] >= 0) ? map[itype] : nfrho-1;
+
+  // ------------------------------------------------------------------
+  // rhor: full nelem x nelem set, stored row by row
+  // ------------------------------------------------------------------
+
+  nrhor = nelem * nelem;
+  memory->destroy(rhor);
+  memory->create(rhor,nrhor,nr+1,"pair:rhor");
+
+  for (int ielem = 0; ielem < nelem; ielem++)
+    for (int jelem = 0; jelem < nelem; jelem++) {
+      const int slot = ielem * nelem + jelem;
+      for (int k = 1; k <= nr; k++)
+        rhor[slot][k] = fs->rhor[ielem][jelem][k];
+    }
+
+  // type2rhor[i][j] = slot of the rhor array for the type pair
+  // OK if map = -1 (non-EAM atom in pair hybrid) b/c type2rhor not used
+
+  for (int itype = 1; itype <= ntypes; itype++)
+    for (int jtype = 1; jtype <= ntypes; jtype++)
+      type2rhor[itype][jtype] = map[itype] * fs->nelements + map[jtype];
+
+  // ------------------------------------------------------------------
+  // z2r: lower triangle only, nelem*(nelem+1)/2 arrays
+  // ------------------------------------------------------------------
+
+  nz2r = nelem * (nelem+1) / 2;
+  memory->destroy(z2r);
+  memory->create(z2r,nz2r,nr+1,"pair:z2r");
+
+  // row ielem of the lower triangle starts at slot ielem*(ielem+1)/2
+
+  for (int ielem = 0; ielem < nelem; ielem++)
+    for (int jelem = 0; jelem <= ielem; jelem++) {
+      const int slot = ielem * (ielem+1) / 2 + jelem;
+      for (int k = 1; k <= nr; k++)
+        z2r[slot][k] = fs->z2r[ielem][jelem][k];
+    }
+
+  // type2z2r[i][j] = slot of the z2r array for the type pair
+  // the larger element index is used as row to stay lower triangular
+  // if map = -1 (non-EAM atom in pair hybrid):
+  //   type2z2r is not used by non-opt
+  //   but set type2z2r to 0 since accessed by opt
+
+  for (int itype = 1; itype <= ntypes; itype++) {
+    for (int jtype = 1; jtype <= ntypes; jtype++) {
+      const int ei = map[itype];
+      const int ej = map[jtype];
+
+      if (ei == -1 || ej == -1) {
+        type2z2r[itype][jtype] = 0;
+        continue;
+      }
+
+      const int irow = (ei < ej) ? ej : ei;
+      const int icol = (ei < ej) ? ei : ej;
+      type2z2r[itype][jtype] = irow * (irow+1) / 2 + icol;
+    }
+  }
+}
--- a/src/USER-CUDA/pair_eam_fs_cuda.h
+++ b/src/USER-CUDA/pair_eam_fs_cuda.h
@ -0,0 +1,44 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(eam/fs/cuda,PairEAMFSCuda)
+
+#else
+
+#ifndef LMP_PAIR_EAM_FS_CUDA_H
+#define LMP_PAIR_EAM_FS_CUDA_H
+
+#include "pair_eam_cuda.h"
+
+namespace LAMMPS_NS {
+
+// use virtual public since this class is parent in multiple inheritance
+
+// Pair style eam/fs/cuda (see the PairStyle entry in this header): PairEAMCuda
+// variant whose coeff()/read_file()/file2array() work on a Finnis-Sinclair
+// potential file.
+class PairEAMFSCuda : virtual public PairEAMCuda {
+ public:
+  PairEAMFSCuda(class LAMMPS *);
+  virtual ~PairEAMFSCuda() {}
+  // handles the single "* * file elem_1 ... elem_ntypes" coefficient line
+  void coeff(int, char **);
+
+ protected:
+  // NOTE(review): PairEAMCuda already declares a protected member named
+  // cuda; this declaration hides it - confirm that is intended
+  class Cuda *cuda;
+  // reads the potential file into the fs struct
+  void read_file(char *);
+  // copies the fs struct into the frho/rhor/z2r arrays and index tables
+  void file2array();
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_gran_hooke_cuda.cpp
+++ b/src/USER-CUDA/pair_gran_hooke_cuda.cpp
@ -0,0 +1,247 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_gran_hooke_cuda.h"
+#include "pair_gran_hooke_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "modify.h"
+#include "fix_pour.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ----------------------------------------------------------------------
+   constructor: take the Cuda handle from the LAMMPS instance, stop with
+   an error when no Cuda object is present, and flag the pair force as
+   cudable in the shared data
+------------------------------------------------------------------------- */
+
+PairGranHookeCuda::PairGranHookeCuda(LAMMPS *lmp) : PairGranHooke(lmp)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+
+  // give the pointer a defined value: init_list() assigns it only for
+  // list id 0 and only when CUDA_USE_BINNING is not defined
+  cuda_neigh_list = NULL;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class allocation if needed and, once per object, publish
+   the pair parameters through the cuda shared data:
+     coeff1[0][0] = kn       coeff1[0][1] = kt
+     coeff1[1][0] = gamman   coeff1[1][1] = gammat
+     coeff2[0][0] = xmu      coeff2[0][1] = dampflag
+------------------------------------------------------------------------- */
+
+void PairGranHookeCuda::allocate()
+{
+  if (!allocated) PairGranHooke::allocate();
+
+  // the shared-data part is set up only on the first call
+  if (allocated2) return;
+  allocated2 = true;
+
+  const int ntypes = atom->ntypes;
+
+  cuda->shared_data.pair.cutsq = cutsq;
+
+  memory->create(cuda->shared_data.pair.coeff1,ntypes+1,ntypes+1,
+                 "pair:cuda_coeff1");
+  memory->create(cuda->shared_data.pair.coeff2,ntypes+1,ntypes+1,
+                 "pair:cuda_coeff2");
+
+  // stiffness and damping constants
+  cuda->shared_data.pair.coeff1[0][0] = kn;
+  cuda->shared_data.pair.coeff1[0][1] = kt;
+  cuda->shared_data.pair.coeff1[1][0] = gamman;
+  cuda->shared_data.pair.coeff1[1][1] = gammat;
+
+  // friction coefficient and damping flag
+  cuda->shared_data.pair.coeff2[0][0] = xmu;
+  cuda->shared_data.pair.coeff2[0][1] = dampflag;
+}
+
+/* ----------------------------------------------------------------------
+   force computation: hand the shared data and the cuda neighbor list to
+   Cuda_PairGranHookeCuda(); the global energy/virial accumulators are
+   uploaded before and, unless collect_forces_later is set, downloaded
+   after that call
+------------------------------------------------------------------------- */
+
+void PairGranHookeCuda::compute(int eflag, int vflag)
+{
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  //cuda->cu_debugdata->memset_device(0);
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairGranHookeCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                         eflag, vflag, eflag_atom, vflag_atom);
+
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) cuda->cu_eng_vdwl->download();
+    if (vflag) cuda->cu_virial->download();
+  }
+  //cuda->cu_debugdata->download();
+  //printf("%lf %lf %lf %lf %lf %lf\n",1.0e-6*cuda->debugdata[0],1.0e-6*cuda->debugdata[1],1.0e-6*cuda->debugdata[2],1.0e-6*cuda->debugdata[3],1.0e-6*cuda->debugdata[4],1.0e-6*cuda->debugdata[5]);
+}
+
+/* ----------------------------------------------------------------------
+   global settings: handled entirely by the base class
+------------------------------------------------------------------------- */
+
+void PairGranHookeCuda::settings(int narg, char **arg)
+{
+  PairGranHooke::settings(narg,arg);
+}
+
+/* ----------------------------------------------------------------------
+   pair coefficients: parsed by the base class, afterwards allocate()
+   publishes the parameters through the cuda shared data
+------------------------------------------------------------------------- */
+
+void PairGranHookeCuda::coeff(int narg, char **arg)
+{
+  PairGranHooke::coeff(narg,arg);
+  allocate();
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style:
+   request the neighbor list, check the required atom attributes, look up
+   fix freeze and fix pour, and determine the max radius of dynamic and
+   frozen particles per atom type across all procs
+------------------------------------------------------------------------- */
+
+void PairGranHookeCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairGranHookeCuda::init_style start\n"); )
+
+  // a full, granular, cudable list is requested, except in the
+  // whichflag == 0 / respa case where no list is requested at all
+
+  const bool respa_setup =
+    (update->whichflag == 0) &&
+    (strcmp(update->integrate_style,"respa") == 0);
+
+  if (!respa_setup) {
+    const int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->gran = 1;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+  if (!atom->radius_flag || !atom->omega_flag || !atom->torque_flag)
+    error->all("Pair granular requires atom attributes radius, omega, torque");
+  if (comm->ghost_velocity == 0)
+    error->all("Pair granular requires ghost atoms store velocity");
+
+  dt = update->dt;
+
+  // fix freeze: remember its group bit (0 if there is no such fix)
+  // and pass it on through the cuda shared data
+
+  int ifix = 0;
+  while (ifix < modify->nfix &&
+         strcmp(modify->fix[ifix]->style,"freeze") != 0)
+    ifix++;
+  freeze_group_bit = (ifix < modify->nfix) ? modify->fix[ifix]->groupbit : 0;
+
+  cuda->shared_data.pair.freeze_group_bit = freeze_group_bit;
+
+  // fix pour: type and largest radius of particles it may insert
+
+  int pour_type = 0;
+  double pour_maxrad = 0.0;
+
+  ifix = 0;
+  while (ifix < modify->nfix &&
+         strcmp(modify->fix[ifix]->style,"pour") != 0)
+    ifix++;
+  if (ifix < modify->nfix) {
+    FixPour *pour = (FixPour *) modify->fix[ifix];
+    pour_type = pour->ntype;
+    pour_maxrad = pour->radius_hi;
+  }
+
+  // per-type max radius on this proc, separately for dynamic and frozen
+  // particles; future fix pour particles count as dynamic
+
+  for (int itype = 1; itype <= atom->ntypes; itype++)
+    onerad_dynamic[itype] = onerad_frozen[itype] = 0.0;
+  if (pour_type) onerad_dynamic[pour_type] = pour_maxrad;
+
+  double *radius = atom->radius;
+  int *mask = atom->mask;
+  int *type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  for (int iatom = 0; iatom < nlocal; iatom++) {
+    const int itype = type[iatom];
+    if (mask[iatom] & freeze_group_bit)
+      onerad_frozen[itype] = MAX(onerad_frozen[itype],radius[iatom]);
+    else
+      onerad_dynamic[itype] = MAX(onerad_dynamic[itype],radius[iatom]);
+  }
+
+  // max over all procs for types 1..ntypes
+
+  MPI_Allreduce(&onerad_dynamic[1],&maxrad_dynamic[1],atom->ntypes,
+                MPI_DOUBLE,MPI_MAX,world);
+  MPI_Allreduce(&onerad_frozen[1],&maxrad_frozen[1],atom->ntypes,
+                MPI_DOUBLE,MPI_MAX,world);
+
+  MYDBG(printf("# CUDA PairGranHookeCuda::init_style end\n"); )
+}
+
+/* ----------------------------------------------------------------------
+   neighbor list callback: forward to the base class and, unless
+   CUDA_USE_BINNING is defined, register the list with id 0 at the Cuda
+   object
+------------------------------------------------------------------------- */
+
+void PairGranHookeCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairGranHookeCuda::init_list\n");)
+
+  PairGranHooke::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // right now only verlet (id 0) can be handled, not respa
+  // see Neighbor::init() for details on lammps lists' logic
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairGranHookeCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then recreate the cCudaData
+   wrappers of the per-atom energy (eatom) and per-atom virial (vatom)
+   arrays whenever atom->nmax exceeds the capacity those arrays had
+   before the base-class call
+------------------------------------------------------------------------- */
+
+void PairGranHookeCuda::ev_setup(int eflag, int vflag)
+{
+  // capacities as they were before the base class had a chance to grow
+  // the host arrays
+  int maxeatomold = maxeatom;
+  int maxvatomold = maxvatom;
+
+  PairGranHooke::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // the per-atom virial follows vflag_atom/maxvatom and is bound to its own
+  // device array; keying it on the eatom flag and slot (as before) rebuilt
+  // it at the wrong time and overwrote the device entry of the energy array
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
--- a/src/USER-CUDA/pair_gran_hooke_cuda.h
+++ b/src/USER-CUDA/pair_gran_hooke_cuda.h
@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gran/hooke/cuda,PairGranHookeCuda)
+
+#else
+
+#ifndef PAIR_GRAN_HOOKE_CUDA_H
+#define PAIR_GRAN_HOOKE_CUDA_H
+
+#include "pair_gran_hooke.h"
+
+namespace LAMMPS_NS {
+
+// Pair style gran/hooke/cuda (see the PairStyle entry in this header):
+// derives from PairGranHooke and redeclares the pair-style entry points below.
+class PairGranHookeCuda : public PairGranHooke
+{
+	public:
+		PairGranHookeCuda(class LAMMPS *);
+		void compute(int, int);
+		void settings(int, char **);
+		void coeff(int, char **);
+		void init_list(int, class NeighList *);
+		void init_style();
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// handle to the package's Cuda object
+		class Cuda *cuda;
+		void allocate();
+		// NOTE(review): presumably marks that the cuda shared-data setup in allocate() has run - confirm in the .cpp
+		bool allocated2;
+		// NOTE(review): presumably the CUDA-side counterpart of the neighbor list passed to init_list() - confirm
+		class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_lj96_cut_cuda.cpp
+++ b/src/USER-CUDA/pair_lj96_cut_cuda.cpp
@ -0,0 +1,184 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj96_cut_cuda.h"
+#include "pair_lj96_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ----------------------------------------------------------------------
+   constructor: take the Cuda handle from the LAMMPS instance, stop with
+   an error when no Cuda object is present, and flag the pair force as
+   cudable in the shared data
+------------------------------------------------------------------------- */
+
+PairLJ96CutCuda::PairLJ96CutCuda(LAMMPS *lmp) : PairLJ96Cut(lmp)
+{
+  cuda = lmp->cuda;
+  if(cuda == NULL)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  allocated2 = false;
+
+  // give the pointer a defined value: init_list() assigns it only for
+  // list id 0 and only when CUDA_USE_BINNING is not defined
+  cuda_neigh_list = NULL;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class allocation if needed and, once per object, store
+   the pointers to the coefficient arrays (cut, lj1-lj4, offset) and to
+   the special_lj/special_coul factors in the cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::allocate()
+{
+  if (!allocated) PairLJ96Cut::allocate();
+
+  // the shared-data part is set up only on the first call
+  if (allocated2) return;
+  allocated2 = true;
+
+  // per type-pair tables of the base class
+  cuda->shared_data.pair.cut = cut;
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.offset = offset;
+
+  // weighting factors kept by the Force class
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ----------------------------------------------------------------------
+   force computation: hand the shared data and the cuda neighbor list to
+   Cuda_PairLJ96CutCuda(); the global energy/virial accumulators are
+   uploaded before and, unless collect_forces_later is set, downloaded
+   after that call
+------------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJ96CutCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                       eflag, vflag, eflag_atom, vflag_atom);
+
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) cuda->cu_eng_vdwl->download();
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   global settings: parsed by the base class, afterwards the global
+   cutoff is copied (converted to F_FLOAT) into the cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::settings(int narg, char **arg)
+{
+  PairLJ96Cut::settings(narg,arg);
+
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_global);
+}
+
+/* ----------------------------------------------------------------------
+   pair coefficients: parsed by the base class, afterwards allocate()
+   publishes the coefficient arrays through the cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::coeff(int narg, char **arg)
+{
+  PairLJ96Cut::coeff(narg,arg);
+  allocate();
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style: request a full, cudable neighbor
+   list and disable the rRESPA cutoffs
+------------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::init_style()
+{
+  MYDBG(printf("# CUDA PairLJ96CutCuda::init_style start\n"); )
+
+  // no list is requested in the whichflag == 0 / respa case
+
+  const bool respa_setup =
+    (update->whichflag == 0) &&
+    (strcmp(update->integrate_style,"respa") == 0);
+
+  if (!respa_setup) {
+    const int irequest = neighbor->request(this);
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->cudable = 1;
+    //neighbor->style=0; //0=NSQ neighboring
+  }
+
+  cut_respa = NULL;
+
+  MYDBG(printf("# CUDA PairLJ96CutCuda::init_style end\n"); )
+}
+
+/* ----------------------------------------------------------------------
+   neighbor list callback: forward to the base class and, unless
+   CUDA_USE_BINNING is defined, register the list with id 0 at the Cuda
+   object
+------------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJ96CutCuda::init_list\n");)
+
+  PairLJ96Cut::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // right now only verlet (id 0) can be handled, not respa
+  // see Neighbor::init() for details on lammps lists' logic
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairLJ96CutCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then recreate the cCudaData
+   wrappers of the per-atom energy (eatom) and per-atom virial (vatom)
+   arrays whenever atom->nmax exceeds the capacity those arrays had
+   before the base-class call
+------------------------------------------------------------------------- */
+
+void PairLJ96CutCuda::ev_setup(int eflag, int vflag)
+{
+  // capacities as they were before the base class had a chance to grow
+  // the host arrays
+  int maxeatomold = maxeatom;
+  int maxvatomold = maxvatom;
+
+  PairLJ96Cut::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // the per-atom virial follows vflag_atom/maxvatom and is bound to its own
+  // device array; keying it on the eatom flag and slot (as before) rebuilt
+  // it at the wrong time and overwrote the device entry of the energy array
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
--- a/src/USER-CUDA/pair_lj96_cut_cuda.h
+++ b/src/USER-CUDA/pair_lj96_cut_cuda.h
@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj96/cut/cuda,PairLJ96CutCuda)
+
+#else
+
+#ifndef PAIR_LJ96_CUT_CUDA_H
+#define PAIR_LJ96_CUT_CUDA_H
+
+#include "pair_lj96_cut.h"
+
+namespace LAMMPS_NS {
+
+// Pair style lj96/cut/cuda (see the PairStyle entry in this header):
+// derives from PairLJ96Cut and redeclares the pair-style entry points below.
+class PairLJ96CutCuda : public PairLJ96Cut
+{
+	public:
+		PairLJ96CutCuda(class LAMMPS *);
+		void compute(int, int);
+		void settings(int, char **);
+		void coeff(int, char **);
+		void init_list(int, class NeighList *);
+		void init_style();
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// handle to the package's Cuda object
+		class Cuda *cuda;
+		void allocate();
+		// NOTE(review): presumably marks that the cuda shared-data setup in allocate() has run - confirm in the .cpp
+		bool allocated2;
+		// NOTE(review): presumably the CUDA-side counterpart of the neighbor list passed to init_list() - confirm
+		class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.cpp
@ -0,0 +1,193 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_charmm_coul_charmm_cuda.h"
+#include "pair_lj_charmm_coul_charmm_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+// Function-like min/max helpers. Each argument is evaluated twice, so do
+// not pass expressions with side effects. Only MAX is used in this file.
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ----------------------------------------------------------------------
+   constructor: take the Cuda object from the LAMMPS instance (fatal
+   error if there is none) and set the pair flags in the CUDA shared data
+------------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmCuda::PairLJCharmmCoulCharmmCuda(LAMMPS *lmp) :
+  PairLJCharmmCoulCharmm(lmp)
+{
+  cuda = lmp->cuda;
+  if (!cuda) {
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  }
+
+  // the CUDA-specific part of allocate() has not been executed yet
+  allocated2 = false;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->shared_data.pair.use_block_per_atom = 0;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   one-time setup: let the base class allocate its arrays if that has
+   not happened yet, then publish the coefficient and special-bond
+   pointers in the CUDA shared data and wrap lj1..lj4 in cCudaData objects
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::allocate()
+{
+  if (!allocated) PairLJCharmmCoulCharmm::allocate();
+
+  // everything below must run only once per instance
+  if (allocated2) return;
+  allocated2 = true;
+
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+
+  // NOTE(review): no matching delete for these wrappers is visible in this file
+  cu_lj1_gm = new cCudaData<double, F_FLOAT, x>((double*)lj1,
+      &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
+  cu_lj2_gm = new cCudaData<double, F_FLOAT, x>((double*)lj2,
+      &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
+  cu_lj3_gm = new cCudaData<double, F_FLOAT, x>((double*)lj3,
+      &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
+  cu_lj4_gm = new cCudaData<double, F_FLOAT, x>((double*)lj4,
+      &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
+}
+
+/* ----------------------------------------------------------------------
+   evaluate the pair interaction through the CUDA wrapper function; the
+   global energy/virial wrappers are uploaded before and downloaded after
+   the call only while collect_forces_later is not set
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) {
+      cuda->cu_eng_vdwl->upload();
+      cuda->cu_eng_coul->upload();
+    }
+    if (vflag) cuda->cu_virial->upload();
+  }
+
+  Cuda_PairLJCharmmCoulCharmmCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                                  eflag, vflag, eflag_atom, vflag_atom,
+                                  denom_lj, cut_coul_innersq, denom_coul);
+
+  // the flag is read again here because shared_data was handed to the wrapper by pointer
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) {
+      cuda->cu_eng_vdwl->download();
+      cuda->cu_eng_coul->download();
+    }
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   global settings: handled by the base class, afterwards the global
+   cutoffs are copied into the CUDA shared data
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::settings(int narg, char **arg)
+{
+  PairLJCharmmCoulCharmm::settings(narg, arg);
+
+  cuda->shared_data.pair.cut_global = static_cast<X_FLOAT>(cut_lj);
+  // cut_coulsq_global is stored a second time in init_style() after cut_coulsq is recomputed there
+  cuda->shared_data.pair.cut_coulsq_global = static_cast<X_FLOAT>(cut_coulsq);
+  cuda->shared_data.pair.cut_inner_global = static_cast<F_FLOAT>(cut_lj_inner);
+}
+
+/* ----------------------------------------------------------------------
+   set coefficients: delegate to the base class, then make sure the
+   CUDA-side setup in allocate() has been done
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::coeff(int narg, char **arg)
+{
+  PairLJCharmmCoulCharmm::coeff(narg, arg);
+
+  // no-op after the first call, see the allocated2 guard in allocate()
+  this->allocate();
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style: require per-atom charges, request a
+   full cudable neighbor list, validate the inner/outer cutoffs and
+   precompute the squared cutoffs and the denominators denom_lj/denom_coul
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::init_style()
+{
+  // the message names this style (it used to say lj/charmm/coul/long)
+  if (!atom->q_flag)
+    error->all("Pair style lj/charmm/coul/charmm/cuda requires atom attribute q");
+
+  // molecular systems: mark force collection as deferred in the shared data
+  if (atom->molecular)
+    cuda->shared_data.pair.collect_forces_later = 1;
+
+  // request a full (not half) neighbor list that is flagged as cudable
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  if (cut_lj_inner >= cut_lj || cut_coul_inner >= cut_coul)
+    error->all("Pair inner cutoff >= Pair outer cutoff");
+
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
+
+  // cube of (outer^2 - inner^2) for the LJ and the Coulomb cutoffs
+  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
+    (cut_ljsq-cut_lj_innersq);
+  denom_coul = (cut_coulsq-cut_coul_innersq) * (cut_coulsq-cut_coul_innersq) *
+    (cut_coulsq-cut_coul_innersq);
+
+  // cut_coulsq is already up to date here; recomputing it again is not needed
+  cuda->shared_data.pair.cut_coulsq_global = cut_coulsq;
+  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
+}
+
+/* ----------------------------------------------------------------------
+   neighbor list callback: forward to the base class and, unless
+   CUDA_USE_BINNING is defined, register list id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCharmmCoulCharmmCuda::init_list\n");)
+  PairLJCharmmCoulCharmm::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) can be handled right now, not respa;
+  // see Neighbor::init() for details on the logic of the lammps lists
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairLJCharmmCoulCharmmCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then rebuild the cCudaData
+   wrappers of the per-atom energy (eatom) and per-atom virial (vatom)
+   arrays when atom->nmax exceeds the capacity they had before the call
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmCuda::ev_setup(int eflag, int vflag)
+{
+  // keep the old capacities; presumably the base class regrows eatom/vatom
+  // and updates maxeatom/maxvatom - TODO confirm against Pair::ev_setup
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+  PairLJCharmmCoulCharmm::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // the virial wrapper depends on vflag_atom and on the vatom capacity, and
+  // must be registered in its own slot so it does not overwrite the eatom one
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
--- a/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.h
+++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_cuda.h
@ -0,0 +1,63 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/charmm/cuda,PairLJCharmmCoulCharmmCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_CUDA_H
+#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_CUDA_H
+
+#include "pair_lj_charmm_coul_charmm.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// Pair style class derived from PairLJCharmmCoulCharmm; registered above
+// under the style name "lj/charmm/coul/charmm/cuda".
+class PairLJCharmmCoulCharmmCuda : public PairLJCharmmCoulCharmm
+{
+	public:
+		PairLJCharmmCoulCharmmCuda(class LAMMPS *);
+		void compute(int, int);
+		void settings(int, char **);
+		void coeff(int, char **);
+		void init_list(int, class NeighList *);
+		void init_style();
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// taken from lmp->cuda in the constructor
+		class Cuda *cuda;
+		void allocate();
+		// true once the CUDA-specific part of allocate() has been executed
+		bool allocated2;
+		// assigned in init_list() from cuda->registerNeighborList() for list id 0
+		class CudaNeighList* cuda_neigh_list;
+		// cCudaData wrappers around lj1..lj4, created in allocate()
+		cCudaData<double  , F_FLOAT , x >* cu_lj1_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_lj2_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_lj3_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_lj4_gm;
+
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.cpp
@ -0,0 +1,188 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_charmm_coul_charmm_implicit_cuda.h"
+#include "pair_lj_charmm_coul_charmm_implicit_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+// Function-like min/max helpers. Each argument is evaluated twice, so do
+// not pass expressions with side effects. Only MAX is used in this file.
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ----------------------------------------------------------------------
+   constructor: take the Cuda object from the LAMMPS instance (fatal
+   error if there is none) and set the pair flags in the CUDA shared data
+------------------------------------------------------------------------- */
+
+PairLJCharmmCoulCharmmImplicitCuda::PairLJCharmmCoulCharmmImplicitCuda(LAMMPS *lmp) :
+  PairLJCharmmCoulCharmmImplicit(lmp)
+{
+  cuda = lmp->cuda;
+  if (!cuda) {
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  }
+
+  // the CUDA-specific part of allocate() has not been executed yet
+  allocated2 = false;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  // this style always marks force collection as deferred
+  cuda->shared_data.pair.collect_forces_later = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   one-time setup: let the base class allocate its arrays if that has
+   not happened yet, then publish the coefficient and special-bond
+   pointers in the CUDA shared data and wrap lj1..lj4 in cCudaData objects
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::allocate()
+{
+  if (!allocated) PairLJCharmmCoulCharmmImplicit::allocate();
+
+  // everything below must run only once per instance
+  if (allocated2) return;
+  allocated2 = true;
+
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+
+  // NOTE(review): no matching delete for these wrappers is visible in this file
+  cu_lj1_gm = new cCudaData<double, F_FLOAT, x>((double*)lj1,
+      &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
+  cu_lj2_gm = new cCudaData<double, F_FLOAT, x>((double*)lj2,
+      &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
+  cu_lj3_gm = new cCudaData<double, F_FLOAT, x>((double*)lj3,
+      &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
+  cu_lj4_gm = new cCudaData<double, F_FLOAT, x>((double*)lj4,
+      &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
+}
+
+/* ----------------------------------------------------------------------
+   evaluate the pair interaction through the CUDA wrapper function; the
+   global energy/virial wrappers are uploaded before and downloaded after
+   the call only while collect_forces_later is not set
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) {
+      cuda->cu_eng_vdwl->upload();
+      cuda->cu_eng_coul->upload();
+    }
+    if (vflag) cuda->cu_virial->upload();
+  }
+
+  Cuda_PairLJCharmmCoulCharmmImplicitCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                                          eflag, vflag, eflag_atom, vflag_atom,
+                                          denom_lj, cut_coul_innersq, denom_coul);
+
+  // the flag is read again here because shared_data was handed to the wrapper by pointer
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) {
+      cuda->cu_eng_vdwl->download();
+      cuda->cu_eng_coul->download();
+    }
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   global settings: handled by the base class, afterwards the global
+   cutoffs are copied into the CUDA shared data
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::settings(int narg, char **arg)
+{
+  PairLJCharmmCoulCharmmImplicit::settings(narg, arg);
+
+  cuda->shared_data.pair.cut_global = static_cast<X_FLOAT>(cut_lj);
+  // cut_coulsq_global is stored a second time in init_style() after cut_coulsq is recomputed there
+  cuda->shared_data.pair.cut_coulsq_global = static_cast<X_FLOAT>(cut_coulsq);
+  cuda->shared_data.pair.cut_inner_global = static_cast<F_FLOAT>(cut_lj_inner);
+}
+
+/* ----------------------------------------------------------------------
+   set coefficients: delegate to the base class, then make sure the
+   CUDA-side setup in allocate() has been done
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::coeff(int narg, char **arg)
+{
+  PairLJCharmmCoulCharmmImplicit::coeff(narg, arg);
+
+  // no-op after the first call, see the allocated2 guard in allocate()
+  this->allocate();
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style: require per-atom charges, request a
+   full cudable neighbor list, validate the inner/outer cutoffs and
+   precompute the squared cutoffs and the denominators denom_lj/denom_coul
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::init_style()
+{
+  // the message names this style (it used to say lj/charmm/coul/long)
+  if (!atom->q_flag)
+    error->all("Pair style lj/charmm/coul/charmm/implicit/cuda requires atom attribute q");
+
+  // request a full (not half) neighbor list that is flagged as cudable
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  if (cut_lj_inner >= cut_lj || cut_coul_inner >= cut_coul)
+    error->all("Pair inner cutoff >= Pair outer cutoff");
+
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coul_innersq = cut_coul_inner * cut_coul_inner;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
+
+  // cube of (outer^2 - inner^2) for the LJ and the Coulomb cutoffs
+  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
+    (cut_ljsq-cut_lj_innersq);
+  denom_coul = (cut_coulsq-cut_coul_innersq) * (cut_coulsq-cut_coul_innersq) *
+    (cut_coulsq-cut_coul_innersq);
+
+  // cut_coulsq is already up to date here; recomputing it again is not needed
+  cuda->shared_data.pair.cut_coulsq_global = cut_coulsq;
+  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
+}
+
+/* ----------------------------------------------------------------------
+   neighbor list callback: forward to the base class and, unless
+   CUDA_USE_BINNING is defined, register list id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCharmmCoulCharmmImplicitCuda::init_list\n");)
+  PairLJCharmmCoulCharmmImplicit::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) can be handled right now, not respa;
+  // see Neighbor::init() for details on the logic of the lammps lists
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairLJCharmmCoulCharmmImplicitCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then rebuild the cCudaData
+   wrappers of the per-atom energy (eatom) and per-atom virial (vatom)
+   arrays when atom->nmax exceeds the capacity they had before the call
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulCharmmImplicitCuda::ev_setup(int eflag, int vflag)
+{
+  // keep the old capacities; presumably the base class regrows eatom/vatom
+  // and updates maxeatom/maxvatom - TODO confirm against Pair::ev_setup
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+  PairLJCharmmCoulCharmmImplicit::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // the virial wrapper depends on vflag_atom and on the vatom capacity, and
+  // must be registered in its own slot so it does not overwrite the eatom one
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
--- a/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.h
+++ b/src/USER-CUDA/pair_lj_charmm_coul_charmm_implicit_cuda.h
@ -0,0 +1,62 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/charmm/implicit/cuda,PairLJCharmmCoulCharmmImplicitCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_IMPLICIT_CUDA_H
+#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_IMPLICIT_CUDA_H
+
+#include "pair_lj_charmm_coul_charmm_implicit.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// Pair style class derived from PairLJCharmmCoulCharmmImplicit; registered
+// above under the style name "lj/charmm/coul/charmm/implicit/cuda".
+class PairLJCharmmCoulCharmmImplicitCuda : public PairLJCharmmCoulCharmmImplicit
+{
+	public:
+		PairLJCharmmCoulCharmmImplicitCuda(class LAMMPS *);
+		void compute(int, int);
+		void settings(int, char **);
+		void coeff(int, char **);
+		void init_list(int, class NeighList *);
+		void init_style();
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// taken from lmp->cuda in the constructor
+		class Cuda *cuda;
+		void allocate();
+		// true once the CUDA-specific part of allocate() has been executed
+		bool allocated2;
+		// assigned in init_list() from cuda->registerNeighborList() for list id 0
+		class CudaNeighList* cuda_neigh_list;
+		// cCudaData wrappers around lj1..lj4, created in allocate()
+		cCudaData<double  , F_FLOAT , x >* cu_lj1_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_lj2_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_lj3_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_lj4_gm;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.cpp
@ -0,0 +1,201 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_charmm_coul_long_cuda.h"
+#include "pair_lj_charmm_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+// Function-like min/max helpers. Each argument is evaluated twice, so do
+// not pass expressions with side effects. Only MAX is used in this file.
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+// Numeric constants; none of them is referenced anywhere else in this file.
+// NOTE(review): names and values look like the erfc() approximation
+// coefficients used by coul/long pair styles - confirm before removing.
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+/* ----------------------------------------------------------------------
+   constructor: take the Cuda object from the LAMMPS instance (fatal
+   error if there is none) and set the pair flags in the CUDA shared data
+------------------------------------------------------------------------- */
+
+PairLJCharmmCoulLongCuda::PairLJCharmmCoulLongCuda(LAMMPS *lmp) :
+  PairLJCharmmCoulLong(lmp)
+{
+  cuda = lmp->cuda;
+  if (!cuda) {
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  }
+
+  // the CUDA-specific part of allocate() has not been executed yet
+  allocated2 = false;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  // this style always marks force collection as deferred
+  cuda->shared_data.pair.collect_forces_later = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   one-time setup: let the base class allocate its arrays if that has
+   not happened yet, then publish the coefficient, offset and special-bond
+   pointers in the CUDA shared data and wrap lj1..lj4 in cCudaData objects
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::allocate()
+{
+  if (!allocated) PairLJCharmmCoulLong::allocate();
+
+  // everything below must run only once per instance
+  if (allocated2) return;
+  allocated2 = true;
+
+  // shared_data.pair.cut is intentionally left untouched here
+  // (the assignment from cut_lj was already disabled in the original code)
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.offset = offset;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+
+  // NOTE(review): no matching delete for these wrappers is visible in this file
+  cu_lj1_gm = new cCudaData<double, F_FLOAT, x>((double*)lj1,
+      &cuda->shared_data.pair.coeff1_gm, (atom->ntypes+1)*(atom->ntypes+1));
+  cu_lj2_gm = new cCudaData<double, F_FLOAT, x>((double*)lj2,
+      &cuda->shared_data.pair.coeff2_gm, (atom->ntypes+1)*(atom->ntypes+1));
+  cu_lj3_gm = new cCudaData<double, F_FLOAT, x>((double*)lj3,
+      &cuda->shared_data.pair.coeff3_gm, (atom->ntypes+1)*(atom->ntypes+1));
+  cu_lj4_gm = new cCudaData<double, F_FLOAT, x>((double*)lj4,
+      &cuda->shared_data.pair.coeff4_gm, (atom->ntypes+1)*(atom->ntypes+1));
+}
+
+/* ----------------------------------------------------------------------
+   evaluate the pair interaction through the CUDA wrapper function; the
+   global energy/virial wrappers are uploaded before and downloaded after
+   the call only while collect_forces_later is not set
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) {
+      cuda->cu_eng_vdwl->upload();
+      cuda->cu_eng_coul->upload();
+    }
+    if (vflag) cuda->cu_virial->upload();
+  }
+
+  Cuda_PairLJCharmmCoulLongCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                                eflag, vflag, eflag_atom, vflag_atom, denom_lj);
+
+  // the flag is read again here because shared_data was handed to the wrapper by pointer
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) {
+      cuda->cu_eng_vdwl->download();
+      cuda->cu_eng_coul->download();
+    }
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   global settings: handled by the base class, afterwards the global
+   cutoffs are copied into the CUDA shared data
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::settings(int narg, char **arg)
+{
+  PairLJCharmmCoulLong::settings(narg, arg);
+
+  cuda->shared_data.pair.cut_global = static_cast<X_FLOAT>(cut_lj);
+  // cut_coulsq_global is stored a second time in init_style() after cut_coulsq is recomputed there
+  cuda->shared_data.pair.cut_coulsq_global = static_cast<X_FLOAT>(cut_coulsq);
+  cuda->shared_data.pair.cut_inner_global = static_cast<F_FLOAT>(cut_lj_inner);
+}
+
+/* ----------------------------------------------------------------------
+   set coefficients: delegate to the base class, then make sure the
+   CUDA-side setup in allocate() has been done
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::coeff(int narg, char **arg)
+{
+  PairLJCharmmCoulLong::coeff(narg, arg);
+
+  // no-op after the first call, see the allocated2 guard in allocate()
+  this->allocate();
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style: require per-atom charges and a
+   KSpace style, request a full cudable neighbor list, validate the LJ
+   cutoffs and copy cut_coulsq, g_ewald and qqrd2e into the CUDA shared data
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::init_style()
+{
+  if (!atom->q_flag) {
+    error->all("Pair style lj/charmm/coul/long requires atom attribute q");
+  }
+
+  // request a full (not half) neighbor list that is flagged as cudable
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  if (cut_lj_inner >= cut_lj) {
+    error->all("Pair inner cutoff >= Pair outer cutoff");
+  }
+
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_coulsq = cut_coul * cut_coul;
+  cut_bothsq = MAX(cut_ljsq,cut_coulsq);
+
+  // cube of (outer^2 - inner^2) for the LJ cutoffs
+  denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) *
+    (cut_ljsq-cut_lj_innersq);
+
+  cuda->shared_data.pair.cut_coulsq_global = cut_coulsq;
+
+  // g_ewald is read from the KSpace object, so one must exist
+  if (!force->kspace) {
+    error->all("Pair style is incompatible with KSpace style");
+  }
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald = g_ewald;
+  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
+
+  if (ncoultablebits != 0) {
+    error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+  }
+}
+
+/* ----------------------------------------------------------------------
+   neighbor list callback: forward to the base class and, unless
+   CUDA_USE_BINNING is defined, register list id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCharmmCoulLongCuda::init_list\n");)
+  PairLJCharmmCoulLong::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) can be handled right now, not respa;
+  // see Neighbor::init() for details on the logic of the lammps lists
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairLJCharmmCoulLongCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then rebuild the cCudaData
+   wrappers of the per-atom energy (eatom) and per-atom virial (vatom)
+   arrays when atom->nmax exceeds the capacity they had before the call
+------------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongCuda::ev_setup(int eflag, int vflag)
+{
+  // keep the old capacities; presumably the base class regrows eatom/vatom
+  // and updates maxeatom/maxvatom - TODO confirm against Pair::ev_setup
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+  PairLJCharmmCoulLong::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // the virial wrapper depends on vflag_atom and on the vatom capacity, and
+  // must be registered in its own slot so it does not overwrite the eatom one
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
--- a/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h
+++ b/src/USER-CUDA/pair_lj_charmm_coul_long_cuda.h
@ -0,0 +1,62 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/long/cuda,PairLJCharmmCoulLongCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_LONG_CUDA_H
+#define LMP_PAIR_LJ_CHARMM_COUL_LONG_CUDA_H
+
+#include "pair_lj_charmm_coul_long.h"
+#include "cuda_data.h"
+
+namespace LAMMPS_NS {
+
+// Pair style class derived from PairLJCharmmCoulLong; registered above
+// under the style name "lj/charmm/coul/long/cuda".
+class PairLJCharmmCoulLongCuda : public PairLJCharmmCoulLong
+{
+	public:
+		PairLJCharmmCoulLongCuda(class LAMMPS *);
+		void compute(int, int);
+		void settings(int, char **);
+		void coeff(int, char **);
+		void init_list(int, class NeighList *);
+		void init_style();
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// taken from lmp->cuda in the constructor
+		class Cuda *cuda;
+		void allocate();
+		// true once the CUDA-specific part of allocate() has been executed
+		bool allocated2;
+		// assigned in init_list() from cuda->registerNeighborList() for list id 0
+		class CudaNeighList* cuda_neigh_list;
+		// cCudaData wrappers around lj1..lj4, created in allocate()
+		cCudaData<double  , F_FLOAT , x >* cu_lj1_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_lj2_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_lj3_gm;
+		cCudaData<double  , F_FLOAT , x >* cu_lj4_gm;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.cpp
@ -0,0 +1,167 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_class2_coul_cut_cuda.h"
+#include "pair_lj_class2_coul_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+// Function-like min/max helpers. Each argument is evaluated twice, so do
+// not pass expressions with side effects. Neither macro is used in this file.
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ----------------------------------------------------------------------
+   constructor: take the Cuda object from the LAMMPS instance (fatal
+   error if there is none) and set the pair flag in the CUDA shared data
+------------------------------------------------------------------------- */
+
+PairLJClass2CoulCutCuda::PairLJClass2CoulCutCuda(LAMMPS *lmp) :
+  PairLJClass2CoulCut(lmp)
+{
+  cuda = lmp->cuda;
+  if (!cuda) {
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+  }
+
+  // the CUDA-specific part of allocate() has not been executed yet
+  allocated2 = false;
+
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   one-time setup: let the base class allocate its arrays if that has
+   not happened yet, then publish the cutoff, coefficient, offset and
+   special-bond pointers in the CUDA shared data
+------------------------------------------------------------------------- */
+
+void PairLJClass2CoulCutCuda::allocate()
+{
+  if (!allocated) PairLJClass2CoulCut::allocate();
+
+  // everything below must run only once per instance
+  if (allocated2) return;
+  allocated2 = true;
+
+  cuda->shared_data.pair.cut = cut_lj;
+  cuda->shared_data.pair.cut_coul = cut_coul;
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.offset = offset;
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ----------------------------------------------------------------------
+   evaluate the pair interaction through the CUDA wrapper function; the
+   global energy/virial wrappers are uploaded before the call and
+   downloaded afterwards unless collect_forces_later is set
+------------------------------------------------------------------------- */
+
+void PairLJClass2CoulCutCuda::compute(int eflag, int vflag)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+
+  // NOTE(review): unlike the download below, the upload is not guarded by
+  // collect_forces_later - confirm that this asymmetry is intended
+  if (eflag) {
+    cuda->cu_eng_vdwl->upload();
+    cuda->cu_eng_coul->upload();
+  }
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJClass2CoulCutCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                               eflag, vflag, eflag_atom, vflag_atom);
+
+  if (!cuda->shared_data.pair.collect_forces_later) {
+    if (eflag) {
+      cuda->cu_eng_vdwl->download();
+      cuda->cu_eng_coul->download();
+    }
+    if (vflag) cuda->cu_virial->download();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   global settings: handled by the base class, afterwards the global LJ
+   and Coulomb cutoffs are copied into the CUDA shared data
+------------------------------------------------------------------------- */
+
+void PairLJClass2CoulCutCuda::settings(int narg, char **arg)
+{
+  PairLJClass2CoulCut::settings(narg, arg);
+
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);
+  cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+}
+
+/* ----------------------------------------------------------------------
+   set coefficients: delegate to the base class, then make sure the
+   CUDA-side setup in allocate() has been done
+------------------------------------------------------------------------- */
+
+void PairLJClass2CoulCutCuda::coeff(int narg, char **arg)
+{
+  PairLJClass2CoulCut::coeff(narg, arg);
+
+  // no-op after the first call, see the allocated2 guard in allocate()
+  this->allocate();
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style: require per-atom charges, request a
+   full cudable neighbor list and copy qqrd2e into the CUDA shared data
+------------------------------------------------------------------------- */
+
+void PairLJClass2CoulCutCuda::init_style()
+{
+  // the message names this style (it used to say lj/cut/coul/cut/cuda)
+  if (!atom->q_flag)
+    error->all("Pair style lj/class2/coul/cut/cuda requires atom attribute q");
+
+  // request a full (not half) neighbor list that is flagged as cudable
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
+}
+
+/* ----------------------------------------------------------------------
+   neighbor list callback: forward to the base class and, unless
+   CUDA_USE_BINNING is defined, register list id 0 with the Cuda object
+------------------------------------------------------------------------- */
+
+void PairLJClass2CoulCutCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJClass2CoulCutCuda::init_list\n");)
+  PairLJClass2CoulCut::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // only the verlet list (id 0) can be handled right now, not respa;
+  // see Neighbor::init() for details on the logic of the lammps lists
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairLJClass2CoulCutCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then rebuild the cCudaData
+   wrappers of the per-atom energy (eatom) and per-atom virial (vatom)
+   arrays when atom->nmax exceeds the capacity they had before the call
+------------------------------------------------------------------------- */
+
+void PairLJClass2CoulCutCuda::ev_setup(int eflag, int vflag)
+{
+  // keep the old capacities; presumably the base class regrows eatom/vatom
+  // and updates maxeatom/maxvatom - TODO confirm against Pair::ev_setup
+  const int maxeatomold = maxeatom;
+  const int maxvatomold = maxvatom;
+  PairLJClass2CoulCut::ev_setup(eflag,vflag);
+
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // the virial wrapper depends on vflag_atom and on the vatom capacity, and
+  // must be registered in its own slot so it does not overwrite the eatom one
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
--- a/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h
+++ b/src/USER-CUDA/pair_lj_class2_coul_cut_cuda.h
@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/class2/coul/cut/cuda,PairLJClass2CoulCutCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CLASS2_COUL_CUT_CUDA_H
+#define LMP_PAIR_LJ_CLASS2_COUL_CUT_CUDA_H
+
+#include "pair_lj_class2_coul_cut.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered above as lj/class2/coul/cut/cuda. Derives from
+// PairLJClass2CoulCut and overrides the entry points listed below.
+class PairLJClass2CoulCutCuda : public PairLJClass2CoulCut
+{
+	public:
+		PairLJClass2CoulCutCuda(class LAMMPS *);
+		// force computation entry point (eflag, vflag)
+		void compute(int, int);
+		// base-class settings(), then copies the global cutoffs to the shared data
+		void settings(int, char **);
+		// base-class coeff(), then allocate()
+		void coeff(int, char **);
+		// base-class init_list(); list id 0 is also registered with the Cuda object
+		void init_list(int, class NeighList *);
+		// requires atom charges and requests a full, cudable neighbor list
+		void init_style();
+		// base-class ev_setup(), then refreshes cu_eatom / cu_vatom
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// Cuda object used by all methods of this class
+		class Cuda *cuda;
+		void allocate();
+		// NOTE(review): presumably guards a one-time setup in allocate() - confirm in the .cpp
+		bool allocated2;
+		// assigned in init_list() for list id 0
+		class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_lj_class2_coul_long_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_class2_coul_long_cuda.cpp
@ -0,0 +1,180 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_class2_coul_long_cuda.h"
+#include "pair_lj_class2_coul_long_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+// Function-like min/max macros; each argument may be evaluated twice,
+// so do not pass expressions with side effects.
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+// Numeric constants; none of them is referenced by the host code in this file.
+// NOTE(review): these look like the coefficients of the erfc() approximation
+// used by the coul/long host styles - confirm before removing.
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+/* ---------------------------------------------------------------------- */
+
+PairLJClass2CoulLongCuda::PairLJClass2CoulLongCuda(LAMMPS *lmp) : PairLJClass2CoulLong(lmp)
+{
+  // Nothing has been handed to the CUDA shared data yet.
+  allocated2 = false;
+
+  // A /cuda style needs the Cuda object owned by the LAMMPS instance.
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // Mark the pair force as cudable and let the Cuda object refresh
+  // its system parameters.
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJClass2CoulLongCuda::allocate()
+{
+  // The host-side arrays are created by the parent style, at most once.
+  if (!allocated) PairLJClass2CoulLong::allocate();
+
+  // The hand-off to the CUDA shared data is done only on the first call.
+  if (allocated2) return;
+  allocated2 = true;
+
+  // Cutoff, the four lj coefficient arrays and the energy offset.
+  cuda->shared_data.pair.cut = cut_lj;
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.offset = offset;
+
+  // special_lj / special_coul are taken from the Force object.
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulLongCuda::compute(int eflag, int vflag)
+{
+  // Energy/virial bookkeeping is only set up when something is tallied.
+  if (eflag || vflag) ev_setup(eflag, vflag);
+
+  // upload() the accumulators that were requested.
+  if (eflag) {
+    cuda->cu_eng_vdwl->upload();
+    cuda->cu_eng_coul->upload();
+  }
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJClass2CoulLongCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                                eflag, vflag, eflag_atom, vflag_atom);
+
+  // When collection is deferred, the download() calls are skipped here.
+  if (cuda->shared_data.pair.collect_forces_later) return;
+
+  if (eflag) {
+    cuda->cu_eng_vdwl->download();
+    cuda->cu_eng_coul->download();
+  }
+  if (vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulLongCuda::settings(int narg, char **arg)
+{
+  // Argument parsing is left to the host-side style.
+  PairLJClass2CoulLong::settings(narg, arg);
+
+  // Only the global LJ cutoff is copied into the shared data here.
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2CoulLongCuda::coeff(int narg, char **arg)
+{
+  // Coefficient parsing is delegated unchanged to the host-side style.
+  this->PairLJClass2CoulLong::coeff(narg, arg);
+
+  // Make sure the coefficient arrays are known to the CUDA shared data.
+  this->allocate();
+}
+
+/* ----------------------------------------------------------------------
+   style-specific initialisation:
+   checks that atoms carry a charge, requests a neighbor list, and
+   copies the Coulomb cutoff, g_ewald and qqrd2e into the CUDA shared data
+   a KSpace style must already be defined
+------------------------------------------------------------------------- */
+
+void PairLJClass2CoulLongCuda::init_style()
+{
+  // the message names this style (it previously named lj/cut/coul/long)
+  if (!atom->q_flag)
+    error->all("Pair style lj/class2/coul/long/cuda requires atom attribute q");
+
+  // request a full (not half) neighbor list flagged as cudable
+
+  int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  // squared Coulomb cutoff, kept on the host and in the shared data
+
+  cut_coulsq = cut_coul * cut_coul;
+  cuda->shared_data.pair.cut_coul_global=cut_coul;
+  cuda->shared_data.pair.cut_coulsq_global=cut_coulsq;
+
+  // warning text repaired: it used to read "uses does not use"
+  if (force->newton) error->warning("Pair style does not use \"newton\" setting. You might test if \"newton off\" makes the simulation run faster.");
+
+  // g_ewald is read from the KSpace object, so one has to exist
+  if (force->kspace == NULL)
+    error->all("Pair style is incompatible with KSpace style");
+  g_ewald = force->kspace->g_ewald;
+  cuda->shared_data.pair.g_ewald=g_ewald;
+  cuda->shared_data.pppm.qqrd2e=force->qqrd2e;
+
+  // Coulomb tables are not supported here; the request is only reported
+  if(ncoultablebits) error->warning("# CUDA: You asked for the useage of Coulomb Tables. This is not supported in CUDA Pair forces. Setting is ignored.\n");
+}
+
+void PairLJClass2CoulLongCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJClass2CoulLongCuda::init_list\n");)
+
+  // The host-side style stores the list pointer as usual.
+  PairLJClass2CoulLong::init_list(id, ptr);
+
+  // Only list id 0 is registered with the Cuda object;
+  // see Neighbor::init() for the list logic.
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+
+  MYDBG(printf("# CUDA PairLJClass2CoulLongCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then rebuild the cCudaData
+   wrappers around the per-atom energy and per-atom virial host arrays
+   whenever atom->nmax exceeds the size recorded before the base call
+   eflag, vflag are forwarded unchanged to the base class
+------------------------------------------------------------------------- */
+
+void PairLJClass2CoulLongCuda::ev_setup(int eflag, int vflag)
+{
+  // sizes as they were before the base class had a chance to change them
+  int maxeatomold = maxeatom;
+  int maxvatomold = maxvatom;
+
+  PairLJClass2CoulLong::ev_setup(eflag,vflag);
+
+  // per-atom energy wrapper
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // per-atom virial wrapper: gated on vflag_atom and the old vatom size,
+  // and bound to the vatom descriptor (previously this branch reused
+  // eflag_atom, maxeatomold and the eatom descriptor)
+  // NOTE(review): assumes shared_data.atom has a vatom entry next to eatom
+  // and that the base class grows vatom when atom->nmax > maxvatom - confirm
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
--- a/src/USER-CUDA/pair_lj_class2_coul_long_cuda.h
+++ b/src/USER-CUDA/pair_lj_class2_coul_long_cuda.h
@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/class2/coul/long/cuda,PairLJClass2CoulLongCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CLASS2_COUL_LONG_CUDA_H
+#define LMP_PAIR_LJ_CLASS2_COUL_LONG_CUDA_H
+
+#include "pair_lj_class2_coul_long.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered above as lj/class2/coul/long/cuda. Derives from
+// PairLJClass2CoulLong and overrides the entry points listed below.
+class PairLJClass2CoulLongCuda : public PairLJClass2CoulLong
+{
+	public:
+		// takes the Cuda object from the LAMMPS instance; errors out if it is NULL
+		PairLJClass2CoulLongCuda(class LAMMPS *);
+		// force computation entry point (eflag, vflag); calls Cuda_PairLJClass2CoulLongCuda
+		void compute(int, int);
+		// base-class settings(), then copies the global LJ cutoff to the shared data
+		void settings(int, char **);
+		// base-class coeff(), then allocate()
+		void coeff(int, char **);
+		// base-class init_list(); list id 0 is also registered with the Cuda object
+		void init_list(int, class NeighList *);
+		// requires atom charges and a KSpace style; requests a full, cudable neighbor list
+		void init_style();
+		// base-class ev_setup(), then refreshes cu_eatom / cu_vatom
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// Cuda object, set in the constructor from lmp->cuda
+		class Cuda *cuda;
+		// base-class allocate() if needed, then a one-time hand-off of the
+		// coefficient arrays to the CUDA shared data
+		void allocate();
+		// true once allocate() has done that hand-off
+		bool allocated2;
+		// assigned in init_list() for list id 0; not set in the constructor
+		class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_lj_class2_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_class2_cuda.cpp
@ -0,0 +1,172 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Paul Crozier (SNL)
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_class2_cuda.h"
+#include "pair_lj_class2_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+// Function-like min/max macros; each argument may be evaluated twice,
+// so do not pass expressions with side effects.
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairLJClass2Cuda::PairLJClass2Cuda(LAMMPS *lmp) : PairLJClass2(lmp)
+{
+  // Nothing has been handed to the CUDA shared data yet.
+  allocated2 = false;
+
+  // A /cuda style needs the Cuda object owned by the LAMMPS instance.
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // Mark the pair force as cudable and let the Cuda object refresh
+  // its system parameters.
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJClass2Cuda::allocate()
+{
+  // The host-side arrays are created by the parent style, at most once.
+  if (!allocated) PairLJClass2::allocate();
+
+  // The hand-off to the CUDA shared data is done only on the first call.
+  if (allocated2) return;
+  allocated2 = true;
+
+  // Cutoff, the four lj coefficient arrays and the energy offset.
+  cuda->shared_data.pair.cut = cut;
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.offset = offset;
+
+  // special_lj / special_coul are taken from the Force object.
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2Cuda::compute(int eflag, int vflag)
+{
+  // Energy/virial bookkeeping is only set up when something is tallied.
+  if (eflag || vflag) ev_setup(eflag, vflag);
+
+  // upload() the accumulators that were requested.
+  if (eflag) cuda->cu_eng_vdwl->upload();
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJClass2Cuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                        eflag, vflag, eflag_atom, vflag_atom);
+
+  // When collection is deferred, the download() calls are skipped here.
+  if (cuda->shared_data.pair.collect_forces_later) return;
+
+  if (eflag) cuda->cu_eng_vdwl->download();
+  if (vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2Cuda::settings(int narg, char **arg)
+{
+  // Argument parsing is left to the host-side style.
+  PairLJClass2::settings(narg, arg);
+
+  // The global cutoff is copied into the shared data as F_FLOAT.
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJClass2Cuda::coeff(int narg, char **arg)
+{
+  // Coefficient parsing is delegated unchanged to the host-side style.
+  this->PairLJClass2::coeff(narg, arg);
+
+  // Make sure the coefficient arrays are known to the CUDA shared data.
+  this->allocate();
+}
+
+void PairLJClass2Cuda::init_style()
+{
+  MYDBG(printf("# CUDA PairLJClass2Cuda::init_style start\n"); )
+
+  // Ask the neighbor class for a list and mark it as a full (not half)
+  // list that is cudable.
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  MYDBG(printf("# CUDA PairLJClass2Cuda::init_style end\n"); )
+}
+
+void PairLJClass2Cuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJClass2Cuda::init_list\n");)
+
+  // The host-side style stores the list pointer as usual.
+  PairLJClass2::init_list(id, ptr);
+
+  // Only list id 0 is registered with the Cuda object;
+  // see Neighbor::init() for the list logic.
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+
+  MYDBG(printf("# CUDA PairLJClass2Cuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then rebuild the cCudaData
+   wrappers around the per-atom energy and per-atom virial host arrays
+   whenever atom->nmax exceeds the size recorded before the base call
+   eflag, vflag are forwarded unchanged to the base class
+------------------------------------------------------------------------- */
+
+void PairLJClass2Cuda::ev_setup(int eflag, int vflag)
+{
+  // sizes as they were before the base class had a chance to change them
+  int maxeatomold = maxeatom;
+  int maxvatomold = maxvatom;
+
+  PairLJClass2::ev_setup(eflag,vflag);
+
+  // per-atom energy wrapper
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // per-atom virial wrapper: gated on vflag_atom and the old vatom size,
+  // and bound to the vatom descriptor (previously this branch reused
+  // eflag_atom, maxeatomold and the eatom descriptor)
+  // NOTE(review): assumes shared_data.atom has a vatom entry next to eatom
+  // and that the base class grows vatom when atom->nmax > maxvatom - confirm
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
--- a/src/USER-CUDA/pair_lj_class2_cuda.h
+++ b/src/USER-CUDA/pair_lj_class2_cuda.h
@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/class2/cuda,PairLJClass2Cuda)
+
+#else
+
+#ifndef PAIR_LJ_CLASS2_CUDA_H
+#define PAIR_LJ_CLASS2_CUDA_H
+
+#include "pair_lj_class2.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered above as lj/class2/cuda. Derives from
+// PairLJClass2 and overrides the entry points listed below.
+class PairLJClass2Cuda : public PairLJClass2
+{
+	public:
+		// takes the Cuda object from the LAMMPS instance; errors out if it is NULL
+		PairLJClass2Cuda(class LAMMPS *);
+		// force computation entry point (eflag, vflag); calls Cuda_PairLJClass2Cuda
+		void compute(int, int);
+		// base-class settings(), then copies the global cutoff to the shared data
+		void settings(int, char **);
+		// base-class coeff(), then allocate()
+		void coeff(int, char **);
+		// base-class init_list(); list id 0 is also registered with the Cuda object
+		void init_list(int, class NeighList *);
+		// requests a full, cudable neighbor list
+		void init_style();
+		// base-class ev_setup(), then refreshes cu_eatom / cu_vatom
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// Cuda object, set in the constructor from lmp->cuda
+		class Cuda *cuda;
+		// base-class allocate() if needed, then a one-time hand-off of the
+		// coefficient arrays to the CUDA shared data
+		void allocate();
+		// true once allocate() has done that hand-off
+		bool allocated2;
+		// assigned in init_list() for list id 0; not set in the constructor
+		class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.cpp
@ -0,0 +1,167 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_cut_coul_cut_cuda.h"
+#include "pair_lj_cut_coul_cut_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+// Function-like min/max macros; each argument may be evaluated twice,
+// so do not pass expressions with side effects.
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulCutCuda::PairLJCutCoulCutCuda(LAMMPS *lmp) : PairLJCutCoulCut(lmp)
+{
+  // Nothing has been handed to the CUDA shared data yet.
+  allocated2 = false;
+
+  // A /cuda style needs the Cuda object owned by the LAMMPS instance.
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // Mark the pair force as cudable and let the Cuda object refresh
+  // its system parameters.
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulCutCuda::allocate()
+{
+  // The host-side arrays are created by the parent style, at most once.
+  if (!allocated) PairLJCutCoulCut::allocate();
+
+  // The hand-off to the CUDA shared data is done only on the first call.
+  if (allocated2) return;
+  allocated2 = true;
+
+  // LJ and Coulomb cutoffs.
+  cuda->shared_data.pair.cut = cut_lj;
+  cuda->shared_data.pair.cut_coul = cut_coul;
+
+  // The four lj coefficient arrays and the energy offset.
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.offset = offset;
+
+  // special_lj / special_coul are taken from the Force object.
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulCutCuda::compute(int eflag, int vflag)
+{
+  // Energy/virial bookkeeping is only set up when something is tallied.
+  if (eflag || vflag) ev_setup(eflag, vflag);
+
+  // upload() the accumulators that were requested.
+  if (eflag) {
+    cuda->cu_eng_vdwl->upload();
+    cuda->cu_eng_coul->upload();
+  }
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJCutCoulCutCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                            eflag, vflag, eflag_atom, vflag_atom);
+
+  // When collection is deferred, the download() calls are skipped here.
+  if (cuda->shared_data.pair.collect_forces_later) return;
+
+  if (eflag) {
+    cuda->cu_eng_vdwl->download();
+    cuda->cu_eng_coul->download();
+  }
+  if (vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulCutCuda::settings(int narg, char **arg)
+{
+  // The host-side style parses the pair_style arguments first.
+  PairLJCutCoulCut::settings(narg, arg);
+
+  // Then both global cutoffs are copied into the CUDA shared data,
+  // converted to the F_FLOAT type.
+  cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulCutCuda::coeff(int narg, char **arg)
+{
+  // Coefficient parsing is delegated unchanged to the host-side style.
+  this->PairLJCutCoulCut::coeff(narg, arg);
+
+  // Make sure the coefficient arrays are known to the CUDA shared data.
+  this->allocate();
+}
+
+void PairLJCutCoulCutCuda::init_style()
+{
+  // This style cannot run on atoms without a charge attribute.
+  if (!atom->q_flag)
+    error->all("Pair style lj/cut/coul/cut/cuda requires atom attribute q");
+
+  // Ask the neighbor class for a list and mark it as a full (not half)
+  // list that is cudable.
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  // qqrd2e is stored in the pppm section of the shared data.
+  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
+}
+
+void PairLJCutCoulCutCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCutCoulCutCuda::init_list\n");)
+
+  // The host-side style stores the list pointer as usual.
+  PairLJCutCoulCut::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // Only list id 0 (verlet) is registered with the Cuda object; respa
+  // lists are not handled. See Neighbor::init() for the list logic.
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairLJCutCoulCutCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then rebuild the cCudaData
+   wrappers around the per-atom energy and per-atom virial host arrays
+   whenever atom->nmax exceeds the size recorded before the base call
+   eflag, vflag are forwarded unchanged to the base class
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulCutCuda::ev_setup(int eflag, int vflag)
+{
+  // sizes as they were before the base class had a chance to change them
+  int maxeatomold = maxeatom;
+  int maxvatomold = maxvatom;
+
+  PairLJCutCoulCut::ev_setup(eflag,vflag);
+
+  // per-atom energy wrapper
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // per-atom virial wrapper: gated on vflag_atom and the old vatom size,
+  // and bound to the vatom descriptor (previously this branch reused
+  // eflag_atom, maxeatomold and the eatom descriptor)
+  // NOTE(review): assumes shared_data.atom has a vatom entry next to eatom
+  // and that the base class grows vatom when atom->nmax > maxvatom - confirm
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
--- a/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h
+++ b/src/USER-CUDA/pair_lj_cut_coul_cut_cuda.h
@ -0,0 +1,57 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/cut/cuda,PairLJCutCoulCutCuda)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_CUT_CUDA_H
+#define LMP_PAIR_LJ_CUT_COUL_CUT_CUDA_H
+
+#include "pair_lj_cut_coul_cut.h"
+
+namespace LAMMPS_NS {
+
+// Pair style registered above as lj/cut/coul/cut/cuda. Derives from
+// PairLJCutCoulCut and overrides the entry points listed below.
+class PairLJCutCoulCutCuda : public PairLJCutCoulCut
+{
+	public:
+		// takes the Cuda object from the LAMMPS instance; errors out if it is NULL
+		PairLJCutCoulCutCuda(class LAMMPS *);
+		// force computation entry point (eflag, vflag); calls Cuda_PairLJCutCoulCutCuda
+		void compute(int, int);
+		// base-class settings(), then copies the global cutoffs to the shared data
+		void settings(int, char **);
+		// base-class coeff(), then allocate()
+		void coeff(int, char **);
+		// base-class init_list(); list id 0 is also registered with the Cuda object
+		void init_list(int, class NeighList *);
+		// requires atom charges and requests a full, cudable neighbor list
+		void init_style();
+		// base-class ev_setup(), then refreshes cu_eatom / cu_vatom
+		void ev_setup(int eflag, int vflag);
+	protected:
+		// Cuda object, set in the constructor from lmp->cuda
+		class Cuda *cuda;
+		// base-class allocate() if needed, then a one-time hand-off of the
+		// coefficient arrays to the CUDA shared data
+		void allocate();
+		// true once allocate() has done that hand-off
+		bool allocated2;
+		// assigned in init_list() for list id 0; not set in the constructor
+		class CudaNeighList* cuda_neigh_list;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp
+++ b/src/USER-CUDA/pair_lj_cut_coul_debye_cuda.cpp
@ -0,0 +1,168 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator 
+
+   Original Version:
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov 
+
+   See the README file in the top-level LAMMPS directory. 
+
+   Contributing author: Paul Crozier (SNL)
+   ----------------------------------------------------------------------- 
+
+   USER-CUDA Package and associated modifications:
+   https://sourceforge.net/projects/lammpscuda/ 
+
+   Christian Trott, christian.trott@tu-ilmenau.de
+   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
+   Theoretical Physics II, University of Technology Ilmenau, Germany 
+
+   See the README file in the USER-CUDA directory. 
+
+   This software is distributed under the GNU General Public License.
+------------------------------------------------------------------------- */
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_lj_cut_coul_debye_cuda.h"
+#include "pair_lj_cut_coul_debye_cuda_cu.h"
+#include "cuda_data.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "kspace.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "cuda_neigh_list.h"
+#include "update.h"
+#include "integrate.h"
+#include "respa.h"
+#include "memory.h"
+#include "error.h"
+#include "cuda.h"
+
+using namespace LAMMPS_NS;
+
+// Function-like min/max macros; each argument may be evaluated twice,
+// so do not pass expressions with side effects.
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulDebyeCuda::PairLJCutCoulDebyeCuda(LAMMPS *lmp) : PairLJCutCoulDebye(lmp)
+{
+  // Nothing has been handed to the CUDA shared data yet.
+  allocated2 = false;
+
+  // A /cuda style needs the Cuda object owned by the LAMMPS instance.
+  cuda = lmp->cuda;
+  if (!cuda)
+    error->all("You cannot use a /cuda class, without activating 'cuda' acceleration. Use no '-a' command line argument, or '-a cuda'.");
+
+  // Mark the pair force as cudable and let the Cuda object refresh
+  // its system parameters.
+  cuda->shared_data.pair.cudable_force = 1;
+  cuda->setSystemParams();
+}
+
+/* ----------------------------------------------------------------------
+   remember pointer to arrays in cuda shared data
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulDebyeCuda::allocate()
+{
+  // The host-side arrays are created by the parent style, at most once.
+  if (!allocated) PairLJCutCoulDebye::allocate();
+
+  // The hand-off to the CUDA shared data is done only on the first call.
+  if (allocated2) return;
+  allocated2 = true;
+
+  // LJ and Coulomb cutoffs.
+  cuda->shared_data.pair.cut = cut_lj;
+  cuda->shared_data.pair.cut_coul = cut_coul;
+
+  // The four lj coefficient arrays and the energy offset.
+  cuda->shared_data.pair.coeff1 = lj1;
+  cuda->shared_data.pair.coeff2 = lj2;
+  cuda->shared_data.pair.coeff3 = lj3;
+  cuda->shared_data.pair.coeff4 = lj4;
+  cuda->shared_data.pair.offset = offset;
+
+  // special_lj / special_coul are taken from the Force object.
+  cuda->shared_data.pair.special_lj = force->special_lj;
+  cuda->shared_data.pair.special_coul = force->special_coul;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulDebyeCuda::compute(int eflag, int vflag)
+{
+  // Energy/virial bookkeeping is only set up when something is tallied.
+  if (eflag || vflag) ev_setup(eflag, vflag);
+
+  // upload() the accumulators that were requested.
+  if (eflag) {
+    cuda->cu_eng_vdwl->upload();
+    cuda->cu_eng_coul->upload();
+  }
+  if (vflag) cuda->cu_virial->upload();
+
+  Cuda_PairLJCutCoulDebyeCuda(&cuda->shared_data, &cuda_neigh_list->sneighlist,
+                              eflag, vflag, eflag_atom, vflag_atom);
+
+  // When collection is deferred, the download() calls are skipped here.
+  if (cuda->shared_data.pair.collect_forces_later) return;
+
+  if (eflag) {
+    cuda->cu_eng_vdwl->download();
+    cuda->cu_eng_coul->download();
+  }
+  if (vflag) cuda->cu_virial->download();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulDebyeCuda::settings(int narg, char **arg)
+{
+  // The host-side style parses the pair_style arguments first.
+  PairLJCutCoulDebye::settings(narg, arg);
+
+  // Then kappa and both global cutoffs are copied into the CUDA shared
+  // data, converted to the F_FLOAT type.
+  cuda->shared_data.pair.kappa = static_cast<F_FLOAT>(kappa);
+  cuda->shared_data.pair.cut_coul_global = static_cast<F_FLOAT>(cut_coul_global);
+  cuda->shared_data.pair.cut_global = static_cast<F_FLOAT>(cut_lj_global);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulDebyeCuda::coeff(int narg, char **arg)
+{
+  // Coefficient parsing is delegated unchanged to the host-side style.
+  this->PairLJCutCoulDebye::coeff(narg, arg);
+
+  // Make sure the coefficient arrays are known to the CUDA shared data.
+  this->allocate();
+}
+
+void PairLJCutCoulDebyeCuda::init_style()
+{
+  // This style cannot run on atoms without a charge attribute.
+  if (!atom->q_flag)
+    error->all("Pair style lj/cut/coul/debye/cuda requires atom attribute q");
+
+  // Ask the neighbor class for a list and mark it as a full (not half)
+  // list that is cudable.
+  const int irequest = neighbor->request(this);
+  neighbor->requests[irequest]->full = 1;
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->cudable = 1;
+
+  // qqrd2e is stored in the pppm section of the shared data.
+  cuda->shared_data.pppm.qqrd2e = force->qqrd2e;
+}
+
+void PairLJCutCoulDebyeCuda::init_list(int id, NeighList *ptr)
+{
+  MYDBG(printf("# CUDA PairLJCutCoulDebyeCuda::init_list\n");)
+
+  // The host-side style stores the list pointer as usual.
+  PairLJCutCoulDebye::init_list(id, ptr);
+
+#ifndef CUDA_USE_BINNING
+  // Only list id 0 (verlet) is registered with the Cuda object; respa
+  // lists are not handled. See Neighbor::init() for the list logic.
+  if (id == 0) {
+    cuda_neigh_list = cuda->registerNeighborList(ptr);
+  }
+#endif
+
+  MYDBG(printf("# CUDA PairLJCutCoulDebyeCuda::init_list end\n");)
+}
+
+/* ----------------------------------------------------------------------
+   run the base-class energy/virial setup, then rebuild the cCudaData
+   wrappers around the per-atom energy and per-atom virial host arrays
+   whenever atom->nmax exceeds the size recorded before the base call
+   eflag, vflag are forwarded unchanged to the base class
+------------------------------------------------------------------------- */
+
+void PairLJCutCoulDebyeCuda::ev_setup(int eflag, int vflag)
+{
+  // sizes as they were before the base class had a chance to change them
+  int maxeatomold = maxeatom;
+  int maxvatomold = maxvatom;
+
+  PairLJCutCoulDebye::ev_setup(eflag,vflag);
+
+  // per-atom energy wrapper
+  if (eflag_atom && atom->nmax > maxeatomold) {
+    delete cuda->cu_eatom;
+    cuda->cu_eatom = new cCudaData<double, ENERGY_FLOAT, x > ((double*)eatom, & cuda->shared_data.atom.eatom , atom->nmax  );
+  }
+
+  // per-atom virial wrapper: gated on vflag_atom and the old vatom size,
+  // and bound to the vatom descriptor (previously this branch reused
+  // eflag_atom, maxeatomold and the eatom descriptor)
+  // NOTE(review): assumes shared_data.atom has a vatom entry next to eatom
+  // and that the base class grows vatom when atom->nmax > maxvatom - confirm
+  if (vflag_atom && atom->nmax > maxvatomold) {
+    delete cuda->cu_vatom;
+    cuda->cu_vatom = new cCudaData<double, ENERGY_FLOAT, yx > ((double*)vatom, & cuda->shared_data.atom.vatom , atom->nmax, 6  );
+  }
+}
+
+
--- a/Show More
+++ b/Show More